In [68]:
import shlex
import subprocess

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
from Bio import SeqIO

In [181]:
def get_command(seq1, seq2):
    """Build the nucmer shell command that aligns ``seq2`` against ``seq1``.

    Args:
        seq1: Path of the first FASTA file passed to nucmer.
        seq2: Path of the second FASTA file passed to nucmer.

    Returns:
        str: A single command line suitable for execution through a shell.
    """
    # The command is executed with shell=True, so each path is quoted to keep
    # spaces and shell metacharacters from being interpreted by the shell.
    # Paths made only of "safe" characters are returned unchanged by quote().
    quoted_seq1 = shlex.quote(str(seq1))
    quoted_seq2 = shlex.quote(str(seq2))
    return f"nucmer --maxmatch -nosimplify {quoted_seq1} {quoted_seq2}"

def run_command(cmd):
    """Run ``cmd`` through the shell, reporting progress on stdout.

    Args:
        cmd: Command line to execute (passed to the shell as-is).

    Returns:
        The ``subprocess.CompletedProcess`` (with captured text stdout/stderr)
        when the command exits with status 0, otherwise ``None``.
    """
    print(f'Running MUMmer with command: {cmd}')
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            check=True,           # non-zero exit -> CalledProcessError
            text=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as e:
        # Report the failure together with whatever the process emitted.
        print(f'Error running command: {e}')
        print(f'Command output (stdout): {e.stdout}')
        print(f'Command error (stderr): {e.stderr}')
        return None

    print('Success!')
    return completed

def parse_delta(delta_content):
    """Parse the text of a nucmer delta file into a table of alignments.

    Lines starting with ``>`` are read as
    ``>ref_name query_name ref_len query_len`` and set the current sequence
    pair. Lines starting with a digit that have exactly seven
    whitespace-separated fields are read as alignment records, of which the
    first four fields (ref start/end, query start/end) are kept. All other
    lines are ignored.

    Args:
        delta_content: Full text of the delta file.

    Returns:
        pandas.DataFrame: One row per alignment with the columns ``ref_name``,
        ``query_name``, ``ref_start``, ``ref_end``, ``query_start``,
        ``query_end``, ``ref_len`` and ``query_len``. The columns are present
        even when no alignment is found.

    Raises:
        ValueError: If an alignment record appears before any ``>`` header.
    """
    columns = [
        'ref_name', 'query_name',
        'ref_start', 'ref_end',
        'query_start', 'query_end',
        'ref_len', 'query_len',
    ]
    alignments = []
    current_ref = ""
    current_query = ""
    ref_len = None
    query_len = None

    for line in delta_content.splitlines():
        # Empty lines (including completely empty input) carry no record;
        # indexing line[0] on them would raise IndexError.
        if not line:
            continue
        if line.startswith('>'):
            parts = line.split()
            current_ref = parts[0][1:]
            current_query = parts[1]
            ref_len = int(parts[2])
            query_len = int(parts[3])
        elif line[0].isdigit():
            parts = line.split()
            # Only 7-field lines are alignment records; other digit-led
            # lines are skipped.
            if len(parts) == 7:
                if ref_len is None:
                    raise ValueError(
                        "Alignment record found before any '>' header line"
                    )
                ref_start, ref_end = int(parts[0]), int(parts[1])
                query_start, query_end = int(parts[2]), int(parts[3])
                alignments.append({
                    'ref_name': current_ref,
                    'query_name': current_query,
                    'ref_start': ref_start,
                    'ref_end': ref_end,
                    'query_start': query_start,
                    'query_end': query_end,
                    'ref_len': ref_len,
                    'query_len': query_len
                })

    # Passing the column list keeps the schema stable for empty results.
    return pd.DataFrame(alignments, columns=columns)

def calculate_offsets(df):
    """Compute where each sequence starts when sequences are laid end to end.

    Sequences are grouped by name (``groupby`` order) and their lengths are
    accumulated; a sequence's offset is the cumulative length up to, but not
    including, that sequence.

    Args:
        df: DataFrame with ``ref_name``, ``ref_len``, ``query_name`` and
            ``query_len`` columns.

    Returns:
        tuple: ``(ref_offsets, query_offsets, ref_cum_lengths,
        query_cum_lengths)`` where the offsets are dicts keyed by sequence
        name and the cumulative lengths are Series indexed by sequence name.
    """
    def _offsets_and_totals(name_col, len_col):
        # First recorded length per sequence name, then running totals.
        per_sequence = df.groupby(name_col)[len_col].first()
        running_total = per_sequence.cumsum()
        starts = {}
        for name, length in per_sequence.items():
            starts[name] = running_total[name] - length
        return starts, running_total

    ref_offsets, ref_cum_lengths = _offsets_and_totals('ref_name', 'ref_len')
    query_offsets, query_cum_lengths = _offsets_and_totals('query_name', 'query_len')

    return ref_offsets, query_offsets, ref_cum_lengths, query_cum_lengths

def create_dotplot(df, output_file='mummer_style_dotplot.html'):
    """Write an interactive dotplot of the alignments in ``df`` to an HTML file.

    Query coordinates go on the x axis and reference coordinates on the
    y axis. Sequences are concatenated along each axis using the offsets from
    ``calculate_offsets``; axis ticks are placed at the cumulative end of each
    sequence and labelled with its name. Alignments whose query end is not
    below the query start are drawn in red, the others in blue.

    Args:
        df: DataFrame with the columns ``ref_name``, ``query_name``,
            ``ref_start``, ``ref_end``, ``query_start``, ``query_end``,
            ``ref_len`` and ``query_len``.
        output_file: Path of the HTML file to write.

    Returns:
        None. The figure is written to ``output_file`` and a confirmation
        message is printed.
    """
    ref_offsets, query_offsets, ref_cum_lengths, query_cum_lengths = calculate_offsets(df)

    fig = go.Figure()

    # Collect all segments of one colour into a single trace. Each segment is
    # followed by None so the line is broken between segments; this avoids
    # creating one Plotly trace per alignment, which scales poorly.
    segments = {'red': ([], []), 'blue': ([], [])}
    rows = zip(
        df['ref_name'], df['query_name'],
        df['ref_start'], df['ref_end'],
        df['query_start'], df['query_end'],
    )
    for ref_name, query_name, ref_start, ref_end, query_start, query_end in rows:
        ref_offset = ref_offsets[ref_name]
        query_offset = query_offsets[query_name]

        color = 'red' if query_end >= query_start else 'blue'
        xs, ys = segments[color]
        xs.extend((int(query_offset + query_start), int(query_offset + query_end), None))
        ys.extend((int(ref_offset + ref_start), int(ref_offset + ref_end), None))

    for color, (xs, ys) in segments.items():
        if not xs:
            continue
        fig.add_trace(go.Scatter(
            x=xs,
            y=ys,
            mode='lines',
            line=dict(color=color, width=1),
            connectgaps=False,  # keep the None separators as gaps
            showlegend=False
        ))

    fig.update_layout(
        title='MUMmer-style Dotplot',
        xaxis_title='Query Sequence',
        yaxis_title='Reference Sequence',
        width=800,
        height=800,
        xaxis=dict(
            range=[0, query_cum_lengths.max()],
            tickmode='array',
            tickvals=list(query_cum_lengths),
            ticktext=list(query_cum_lengths.index),
            tickangle=45
        ),
        yaxis=dict(
            range=[0, ref_cum_lengths.max()],
            tickmode='array',
            tickvals=list(ref_cum_lengths),
            ticktext=list(ref_cum_lengths.index),
            # Lock the y scale to the x scale so both axes use the same units.
            scaleanchor='x',
            scaleratio=1
        )
    )

    fig.write_html(output_file)
    print(f"MUMmer-style dotplot saved as {output_file}")

In [164]:
# Self-comparison: the same assembly FASTA is passed as both arguments.
# NOTE(review): absolute, machine-specific path — consider making the data
# directory configurable so the notebook runs elsewhere.
seq1 = "/home/mf019/borrelia_plasmid_classifier_v3/assemblies/URI88H.fasta"
cmd = get_command(seq1, seq1)

In [168]:
# Execute the command built above. As the last expression of the cell, the
# return value (CompletedProcess on success, None on failure) is displayed.
run_command(cmd)

Running MUMmer with command: nucmer --maxmatch -nosimplify /home/mf019/borrelia_plasmid_classifier_v3/assemblies/URI88H.fasta /home/mf019/borrelia_plasmid_classifier_v3/assemblies/URI88H.fasta
Success!


CompletedProcess(args='nucmer --maxmatch -nosimplify /home/mf019/borrelia_plasmid_classifier_v3/assemblies/URI88H.fasta /home/mf019/borrelia_plasmid_classifier_v3/assemblies/URI88H.fasta', returncode=0, stdout='', stderr='1: PREPARING DATA\n2,3: RUNNING mummer AND CREATING CLUSTERS\n# reading input file "out.ntref" of length 1540635\n# construct suffix tree for sequence of length 1540635\n# (maximum reference length is 2305843009213693948)\n# (maximum query length is 18446744073709551615)\n# process 15406 characters per dot\n#....................................................................................................\n# CONSTRUCTIONTIME /home/mf019/miniconda3/opt/mummer-3.23/mummer out.ntref 0.69\n# reading input file "/home/mf019/borrelia_plasmid_classifier_v3/assemblies/URI88H.fasta" of length 1540634\n# matching query-file "/home/mf019/borrelia_plasmid_classifier_v3/assemblies/URI88H.fasta"\n# against subject-file "out.ntref"\n# COMPLETETIME /home/mf019/miniconda3/opt/mummer

In [182]:
# Read the delta file into memory and parse it into a DataFrame of alignments.
# NOTE(review): 'out.delta' is presumably the file written to the working
# directory by the nucmer run above — confirm before re-running elsewhere.
with open('out.delta', 'r') as infile:
    delta_content = infile.read()
    df = parse_delta(delta_content)

In [183]:
# Show the parsed alignment table as plain text for a quick sanity check.
print(df)

          ref_name    query_name  ref_start  ref_end  query_start  query_end  \
0     contig000001  contig000001          1   906672            1     906672   
1     contig000001  contig000001     206151   206234       206246     206325   
2     contig000001  contig000001     206246   206325       206151     206234   
3     contig000001  contig000001     213015   213810       213501     214297   
4     contig000001  contig000001     213016   213324       213988     214297   
...            ...           ...        ...      ...          ...        ...   
2099  contig000021  contig000032       3040     3168          129          1   
2100  contig000033  contig000033          1      123            1        123   
2101  contig000029  contig000033          1      113           11        123   
2102  contig000005  contig000033      28282    28403          122          1   
2103  contig000021  contig000033       1475     1574          100          1   

      ref_len  query_len  
0      90667

In [184]:
# Render the dotplot for the parsed alignments using the default output file name.
create_dotplot(df)

MUMmer-style dotplot saved as mummer_style_dotplot.html
