In [11]:
import os
import glob
import pandas
import tempfile
from pathlib import Path
from Bio import SeqIO
from collections import defaultdict
from pygenomeviz import GenomeViz
from pygenomeviz.parser import Genbank
from pygenomeviz.align import AlignCoord
from pygenomeviz.utils import ColorCycler

In [12]:
# Let's start with some functions to process each contig. The per-contig input comes from iterating through
# the align_coords.tsv dataframe after grouping it by contig_id.

def process_assembly(homologies_file, syntenies_file, reference_gb, assembly_gb, output_dir, **kwargs):
    """Plot every contig listed in the homology table of one assembly.

    Both alignment tables are read as tab-separated files and grouped by their
    ``REF_NAME`` column; each group key is used as the contig id. Only contigs
    that appear in the homology table are visited. A contig with no rows in the
    synteny table is plotted with ``homology_only=True``.

    Args:
        homologies_file: Path to the tab-separated homology alignment table.
        syntenies_file: Path to the tab-separated synteny alignment table.
        reference_gb: Path to the reference GenBank file.
        assembly_gb: Path to the assembly GenBank file.
        output_dir: Base directory; ``<output_dir>/<assembly_id>/png`` and
            ``<output_dir>/<assembly_id>/html`` are created if missing.
        **kwargs: Accepted for call compatibility; currently unused.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Parse inside context managers so the file handles are closed as soon as
    # the records have been materialised into lists.
    with open(assembly_gb, 'r') as assembly_handle:
        gbk_assembly = list(SeqIO.parse(assembly_handle, 'genbank'))
    with open(reference_gb, 'r') as reference_handle:
        gbk_reference = list(SeqIO.parse(reference_handle, 'genbank'))

    # Derive both ids the same way (file name minus its last extension).
    assembly_id = Path(assembly_gb).stem
    reference_id = Path(reference_gb).stem

    output_path = os.path.join(output_dir, assembly_id)
    png_path = os.path.join(output_path, 'png')
    html_path = os.path.join(output_path, 'html')
    for directory in (output_path, png_path, html_path):
        check_n_make(directory)

    homologies = pandas.read_csv(homologies_file, sep='\t')
    syntenies = pandas.read_csv(syntenies_file, sep='\t')

    grouped_homologies = homologies.groupby('REF_NAME') # THIS WILL REQUIRE CHANGING IN THE FUTURE {{TO-DO: FIX FOR NON B31 CASE}}
    grouped_syntenies = syntenies.groupby('REF_NAME') # THIS WILL REQUIRE CHANGING IN THE FUTURE {{TO-DO: FIX FOR NON B31 CASE}}

    for contig_id, contig_df in grouped_homologies:
        num_homologies = len(contig_df)
        if check_group_exists(grouped_syntenies, contig_id):
            num_syntenies = len(grouped_syntenies.get_group(contig_id))
        else:
            num_syntenies = 0
        print(f'Processing {contig_id}: homologies:{num_homologies}, syntenies:{num_syntenies}')

        has_syntenies = num_syntenies > 0
        if not has_syntenies:
            print(f"\tError plotting {contig_id}! No syntenies! Attempting to only plot homology!\n")
        process_and_plot(
            contig_id, contig_df, grouped_syntenies, gbk_assembly, gbk_reference,
            assembly_id, reference_id, output_path, homology_only=not has_syntenies,
        )
    print(f"Finished plotting contigs for {assembly_id}! Have a wonderful day :)")

def process_and_plot(contig_id, contig_df, grouped_syntenies, gbk_assembly, gbk_reference, assembly_id, reference_id, output_path, **kwargs):
    """Draw one contig against the matching reference records and save the plot.

    Two feature tracks are added (assembly first, reference second), CDS
    features are drawn on each segment, and alignment links are added between
    the tracks. The figure is written to
    ``<output_path>/png/<contig_id>_synteny_plot.png`` and
    ``<output_path>/html/<contig_id>_synteny_plot.html``.

    Args:
        contig_id: Id of the contig; used to select the assembly record, to
            look up the synteny group, and in the output file names.
        contig_df: Homology alignment rows for this contig.
        grouped_syntenies: GroupBy object queried with ``get_group(contig_id)``
            unless ``homology_only`` is set.
        gbk_assembly: Parsed GenBank records of the assembly.
        gbk_reference: Parsed GenBank records of the reference.
        assembly_id: Name of the assembly track (also its temp file name).
        reference_id: Name of the reference track (also its temp file name).
        output_path: Directory that already contains ``png`` and ``html``
            sub-directories.
        **kwargs: ``homology_only`` (bool, default False) skips synteny
            processing and synteny links.

    Returns:
        None.
    """
    homology_only = kwargs.get('homology_only', False)

    ref_homology_ids, homology_coords = process_contig_alignments(contig_df)
    synteny_coords = []
    if homology_only:
        reference_ids = ref_homology_ids
    else:
        ref_synteny_ids, synteny_coords = process_contig_alignments(grouped_syntenies.get_group(contig_id))
        reference_ids = ref_homology_ids.union(ref_synteny_ids)

    # Pass the contig id as a one-element set: parse_split_genbank tests
    # `record.id in record_ids`, which on a bare string is a substring match
    # and could pick up records whose id is merely contained in contig_id.
    gbks = [
        parse_split_genbank(gbk_assembly, {contig_id}, assembly_id),
        parse_split_genbank(gbk_reference, reference_ids, reference_id),
    ]

    # Initialize GenomeViz
    gv = GenomeViz(track_align_type="center")
    gv.set_scale_bar()

    # One track per GenBank object; the first track gets its segment sublabels
    # on top, every later track underneath.
    for track_number, gbk in enumerate(gbks):
        sublabel_pos = "top-center" if track_number == 0 else "bottom-center"
        color = ColorCycler()
        track = gv.add_feature_track(gbk.name, gbk.get_seqid2size(), space=0.05, label_kws=dict(color=color), align_label=False)
        for seqid, features in gbk.get_seqid2features(feature_type="CDS").items():
            segment = track.get_segment(seqid)
            segment.add_sublabel(f"{segment.name}: {segment.start:,}-{segment.end:,}bp", size = 8, pos = sublabel_pos, ymargin=0.3)
            for feature in features:
                segment.add_features(feature, fc="blue", lw=0.077, ignore_outside_range = True, plotstyle="arrow")

    # Homology links, coloured by identity.
    if len(homology_coords) > 0:
        # Missing identities (None / NaN / 0) are ignored when picking the
        # lower bound; if none are usable fall back to 0 instead of letting
        # min() raise on an empty sequence.
        identities = [ac.identity for ac in homology_coords if ac.identity and pandas.notna(ac.identity)]
        min_ident = int(min(identities)) if identities else 0
        color, inverted_color = "grey", "red"
        for ac in homology_coords:
            gv.add_link(ac.query_link, ac.ref_link, color=color, inverted_color=inverted_color, filter_length=250, curve=True, v=ac.identity, vmin=min_ident)
        gv.set_colorbar([color, inverted_color], vmin=min_ident, bar_label="Identity")

    # Synteny links use fixed colours and no identity gradient.
    if not homology_only:
        color, inverted_color = "blue", "purple"
        for sc in synteny_coords:
            gv.add_link(sc.query_link, sc.ref_link, color=color, inverted_color=inverted_color, curve=True)

    # Save the plot as PNG and HTML
    output_png = os.path.join(output_path, 'png', f'{contig_id}_synteny_plot.png')
    output_html = os.path.join(output_path, 'html', f'{contig_id}_synteny_plot.html')
    gv.savefig(output_png)
    gv.savefig_html(output_html)
    print(f'Finished plotting {contig_id}\n')

def process_contig_alignments(contig_df):
    """Turn the alignment rows of one contig into AlignCoord objects.

    Each row must provide the columns QUERY_ID, QUERY_NAME, QUERY_START,
    QUERY_END, REF_ID, REF_NAME, REF_START, REF_END and IDENTITY; any other
    columns are ignored.

    Returns:
        A tuple ``(names, coords)`` where ``names`` is the set of QUERY_NAME
        values seen and ``coords`` is the list of AlignCoord objects in row
        order.
    """
    # Positional order expected by the AlignCoord call below.
    coord_columns = (
        'QUERY_ID', 'QUERY_NAME', 'QUERY_START', 'QUERY_END',
        'REF_ID', 'REF_NAME', 'REF_START', 'REF_END', 'IDENTITY',
    )
    query_names = set()
    align_coords = []
    for _, alignment in contig_df.iterrows():
        fields = dict(alignment.items())
        query_names.add(fields['QUERY_NAME']) # change this to query_id for all_v_all ? doesn't matter for B31
        align_coords.append(AlignCoord(*[fields[column] for column in coord_columns]))
    return query_names, align_coords

def parse_genbanks_for_contig(reference_ids, where_to_look='.'):
    """Load the GenBank files for several ids as one combined Genbank object.

    Args:
        reference_ids: Iterable of ids; each is resolved to a file through
            ``get_genbank_file``.
        where_to_look: Directory handed to ``get_genbank_file``. The lookup
            helper requires a directory argument, which the previous version
            of this function never supplied.

    Returns:
        A ``Genbank`` object built from the concatenated temporary file.

    Raises:
        FileNotFoundError: Propagated from ``get_genbank_file`` when a file is
            missing.
    """
    files = [get_genbank_file(genbank_id, where_to_look) for genbank_id in reference_ids]
    combined_temp_file = concatenate_genbank_files(files)
    return Genbank(combined_temp_file)

def parse_split_genbank(records, record_ids, temp_file_name):
    """Write the selected records to a temp GenBank file and load it as Genbank.

    Args:
        records: Iterable of parsed GenBank records (objects with an ``id``).
        record_ids: Either a single record id (str) or a collection of ids.
        temp_file_name: File name used inside ``tempfile.gettempdir()``; also
            passed to ``Genbank`` as its ``name``.

    Returns:
        A ``Genbank`` object read from the temporary file. The file is not
        removed by this function.
    """
    # A bare string must be treated as ONE id: `record.id in "contig000012"`
    # would be a substring test and could match unrelated, shorter ids.
    if isinstance(record_ids, str):
        wanted_ids = {record_ids}
    else:
        wanted_ids = set(record_ids)

    temp_file_path = os.path.join(tempfile.gettempdir(), temp_file_name)
    filtered_records = [record for record in records if record.id in wanted_ids]
    with open(temp_file_path, 'w') as outfile:
        SeqIO.write(filtered_records, outfile, 'genbank')
    return Genbank(temp_file_path, name=temp_file_name)

def check_group_exists(grouped_df, group_name):
    """Return True when `group_name` is one of the keys of `grouped_df.groups`."""
    known_groups = grouped_df.groups
    return group_name in known_groups

def check_n_make(path):
    """Create directory `path`, including missing parents, if it is absent.

    ``exist_ok=True`` replaces the separate exists-check, so a directory that
    appears between the check and the creation no longer raises.

    Raises:
        FileExistsError: If `path` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def concatenate_genbank_files(file_list):
    """Copy the records of several GenBank files into one temporary file.

    Args:
        file_list: Iterable of paths to GenBank files, read in order.

    Returns:
        Path of the temporary ``.gb`` file. It is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # Write through the NamedTemporaryFile handle itself and let the context
    # manager close it; previously that handle was left open while the same
    # path was opened a second time.
    with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.gb') as outfile:
        for file_name in file_list:
            with open(file_name, 'r') as infile:
                SeqIO.write(SeqIO.parse(infile, 'genbank'), outfile, 'genbank')
        combined_path = outfile.name
    return combined_path

def get_genbank_file(genbank_id, where_to_look='.'):
    """Return the path of ``<genbank_id>.gbff`` inside `where_to_look`.

    Args:
        genbank_id: File name without the ``.gbff`` extension.
        where_to_look: Directory to search; defaults to the current directory.

    Returns:
        The joined path, when it exists.

    Raises:
        FileNotFoundError: If no such path exists.
    """
    file_path = os.path.join(where_to_look, f'{genbank_id}.gbff')
    if os.path.exists(file_path):
        # Previously returned an undefined name (NameError on every hit).
        return file_path
    raise FileNotFoundError(f"The file at path '{file_path}' does not exist.")

def make_track_from_gb(gv, track_id, lens, features):
    """Add a feature track to `gv`, give it a sublabel, and attach `features`.

    Returns:
        The track object returned by ``gv.add_feature_track``.
    """
    new_track = gv.add_feature_track(track_id, lens)
    new_track.add_sublabel()
    new_track.add_features(features)
    return new_track


In [13]:
# Output folder for the single-assembly test run, placed under the current working directory.
cwd = os.getcwd()
test_output_dir = os.path.join(cwd, 'script_testing_output')

In [14]:
# Collect every .gbff file found at any depth below the annotation directory.
assembly_genbanks_dir = '/home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation'
assembly_genbanks = glob.glob(assembly_genbanks_dir + '/**/*.gbff', recursive=True)

In [15]:
# Inputs for a single-assembly test run (ESI26H against B31).
# The two tables are loaded here without their EVALUE column for inspection.
test_homology_file = 'asms_vs_B31_v2/ESI26H/align_coords.tsv'
test_homology = pandas.read_csv(test_homology_file, sep='\t').drop(columns=['EVALUE'])

test_synteny_file = 'asms_vs_B31_prot/ESI26H/align_coords.tsv'
test_synteny = pandas.read_csv(test_synteny_file, sep='\t').drop(columns=['EVALUE'])

test_assembly_genbank = '/home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation/ESI26H/ESI26H.gbff'
b31_genbank = 'renamed_GCF_000008685.2.gbff'

In [16]:
# Run the pipeline on the test assembly. process_assembly has no return
# statement, so `fig` ends up bound to None; the plots are written to disk.
fig = process_assembly(
    homologies_file=test_homology_file,
    syntenies_file=test_synteny_file,
    reference_gb=b31_genbank,
    assembly_gb=test_assembly_genbank,
    output_dir=test_output_dir,
)

Processing contig000001: homologies:3, syntenies:746
Finished plotting contig000001

Processing contig000002: homologies:12, syntenies:54
Finished plotting contig000002

Processing contig000003: homologies:15, syntenies:31
Finished plotting contig000003

Processing contig000004: homologies:2, syntenies:26
Finished plotting contig000004

Processing contig000005: homologies:3, syntenies:29
Finished plotting contig000005

Processing contig000006: homologies:1, syntenies:1
Finished plotting contig000006

Processing contig000007: homologies:6, syntenies:9
Finished plotting contig000007

Processing contig000008: homologies:7, syntenies:14
Finished plotting contig000008

Processing contig000009: homologies:3, syntenies:23
Finished plotting contig000009

Processing contig000010: homologies:7, syntenies:13
Finished plotting contig000010

Processing contig000011: homologies:2, syntenies:18
Finished plotting contig000011

Processing contig000012: homologies:4, syntenies:8
Finished plotting contig

In [17]:
# Inputs for the batch run over all assemblies; these repeat the values assigned in earlier cells.
assembly_genbanks_dir = '/home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation'
assembly_genbanks = glob.glob(assembly_genbanks_dir + '/**/*.gbff', recursive=True)
b31_genbank = 'renamed_GCF_000008685.2.gbff'

In [18]:
# The output directory does not depend on the assembly, so compute it once.
cwd = os.getcwd()
output_dir = os.path.join(cwd, 'asm_v_b31_subplots')

for assembly in assembly_genbanks:
    asm_id = Path(assembly).stem
    print(asm_id)
    homologies = f'asms_vs_B31_v2/{asm_id}/align_coords.tsv'
    syntenies = f'asms_vs_B31_prot/{asm_id}/align_coords.tsv'
    has_homologies = os.path.exists(homologies)
    has_syntenies = os.path.exists(syntenies)
    print(has_homologies)
    print(has_syntenies)
    print(output_dir)
    # Act on the existence checks: a missing alignment table would otherwise
    # raise inside process_assembly and abort the remaining assemblies.
    if not (has_homologies and has_syntenies):
        print(f'\tSkipping {asm_id}: alignment file(s) missing')
        continue
    process_assembly(homologies, syntenies,
                     b31_genbank, assembly, output_dir)

URI87H
True
True
/home/mf019/longread_pangenome/synteny/asm_v_b31_subplots/URI87H
Processing contig000001: homologies:3, syntenies:746
Finished plotting contig000001

Processing contig000002: homologies:1, syntenies:53
Finished plotting contig000002

Processing contig000003: homologies:1, syntenies:23
Finished plotting contig000003

Processing contig000004: homologies:1, syntenies:24
Finished plotting contig000004

Processing contig000005: homologies:2, syntenies:22
Finished plotting contig000005

Processing contig000006: homologies:1, syntenies:9
Finished plotting contig000006

Processing contig000007: homologies:1, syntenies:14
Finished plotting contig000007

Processing contig000008: homologies:3, syntenies:13
Finished plotting contig000008

Processing contig000009: homologies:3, syntenies:26
Finished plotting contig000009

Processing contig000010: homologies:2, syntenies:31
Finished plotting contig000010

Processing contig000011: homologies:2, syntenies:39
Finished plotting contig00