In [126]:
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import os
import subprocess
import pandas as pd
from io import StringIO
from Comparative_Analysis import Utilities as util
import math
from tqdm.auto import tqdm
import re

In [98]:
# Root folder of the project; the other locations below are derived from it.
project_dir = 'D:/Project_Data/Project_6'
# Folder holding one sub-directory per downloaded NCBI assembly.
genome_datasets_dir = f'{project_dir}/Datasets/NCBI_Datasets'
# Scratch folder for the intermediate FASTA files written by this notebook.
temp_fileloc = f'{project_dir}/Temp_Files'

In [99]:
def write_fasta(sequence, name, file, line_length=60):
    """Write one sequence to ``file`` as a single-record FASTA file.

    Parameters
    ----------
    sequence : str or object convertible with ``str()``
        The residues to write. The value is passed through ``str()`` first,
        so string-like sequence objects are accepted as well as plain strings.
    name : str
        Text placed after the leading ``>`` on the header line.
    file : str or path-like
        Destination path; an existing file is overwritten.
    line_length : int, optional
        Maximum number of residues per sequence line (default 60).

    Raises
    ------
    ValueError
        If ``line_length`` is smaller than 1.
    """
    if line_length < 1:
        raise ValueError("line_length must be a positive integer")

    sequence = str(sequence)
    lines = [">" + name + "\n"]
    # Slice the sequence into consecutive chunks of at most `line_length`
    # characters; an empty sequence yields a header-only file.
    lines.extend(
        sequence[start:start + line_length] + "\n"
        for start in range(0, len(sequence), line_length)
    )
    # newline='' turns off newline translation, so "\n" is written verbatim
    # on every platform.
    with open(file, 'w', newline='') as outfile:
        outfile.writelines(lines)

In [100]:
# Assembly accessions compared in this notebook. Downstream cells pass the
# first entry to the MUMmer tools before the second one.
id_list = [
    'GCF_000195955.2',
    'GCF_000157895.3',
]

In [109]:
# Export each assembly as a single-sequence FASTA file named after its accession.
# Only the FIRST record of every genomic.gbff file is used; any further records
# in the file are ignored.
# The loop variable is called `genome_id` (not `id`) so the builtin `id()` is not
# shadowed in the notebook namespace.
for genome_id in tqdm(id_list):
    gbff_path = genome_datasets_dir + '/' + genome_id + '/genomic.gbff'
    # Open the file ourselves so the handle is closed as soon as the first record
    # has been read, instead of lingering in a half-consumed parser.
    with open(gbff_path) as gbff_handle:
        genome_record = next(SeqIO.parse(gbff_handle, "genbank"))
    # Not used by the cells shown here; kept because later cells may read it.
    organism_name = genome_record.annotations['organism']
    full_sequence = genome_record.seq
    write_fasta(str(full_sequence), genome_id, temp_fileloc + '/' + genome_id + '.fasta')

  0%|          | 0/2 [00:00<?, ?it/s]

In [110]:
# Run promer inside WSL on the two FASTA files written above; `-p promer` is the
# output prefix, and later cells read the resulting `promer.delta`.
# util.wslname is given the Windows-style path INCLUDING a trailing space; per the
# recorded output of this cell it yields the /mnt/... form with the space kept,
# which is what separates the two file arguments on the command line.
promer_command = (
    'wsl cd ~; cd mummer4/mummer-4.0.0rc1; promer -p promer '
    + util.wslname(temp_fileloc + '/' + id_list[0] + '.fasta ')
    + util.wslname(temp_fileloc + '/' + id_list[1] + '.fasta ')
)
# Left as the last expression so the notebook displays the CompletedProcess.
subprocess.run(promer_command, shell=True)

CompletedProcess(args='wsl cd ~; cd mummer4/mummer-4.0.0rc1; promer -p promer /mnt/d/Project_Data/Project_6/Temp_Files/GCF_000195955.2.fasta /mnt/d/Project_Data/Project_6/Temp_Files/GCF_000157895.3.fasta ', returncode=0)

In [111]:
# Summarise promer.delta with show-coords and keep its standard output as text.
# `-T` requests tab-delimited output (see the MUMmer manual), which is what the
# table-parsing cell below relies on.
show_coords_command = 'wsl cd ~; cd mummer4/mummer-4.0.0rc1; show-coords -r -c -l -L 100 -I 50 -T promer.delta'
show_coords_result = subprocess.run(show_coords_command, shell=True, capture_output=True)
temp = show_coords_result.stdout.decode('utf-8')

In [112]:
# Column labels assigned to the show-coords table; the tool's own header lines
# are skipped rather than parsed.
column_names = [
    'S1', 'E1', 'S2', 'E2',
    'LEN 1', 'LEN 2',
    '% IDY', '% SIM', '% STP',
    'LEN R', 'LEN Q',
    'COV R', 'COV Q',
    'FRM_1', 'FRM_2',
    'TAGS_1', 'TAGS_2',
]
# The first four lines of the captured text are skipped and the labels above are
# applied positionally; index_col=False stops pandas from using the first column
# as the index.
dataframe = pd.read_table(
    StringIO(temp),
    skiprows=4,
    index_col=False,
    header=None,
    names=column_names,
)

In [114]:
# Persist the coordinate table next to the other project data.
coords_csv_path = f'{project_dir}/mummer_coords.csv'
dataframe.to_csv(coords_csv_path)

In [121]:
# Capture the raw show-aligns report for the two sequences as text.
# This rebinds `temp`, replacing the show-coords text stored in it earlier.
show_aligns_command = (
    'wsl cd ~; cd mummer4/mummer-4.0.0rc1; show-aligns promer.delta '
    + id_list[0] + ' ' + id_list[1]
)
show_aligns_result = subprocess.run(show_aligns_command, shell=True, capture_output=True)
temp = show_aligns_result.stdout.decode('utf-8')


In [131]:
def extract_ref_and_query_seq(cleaned_lines):
    """Pull the two sequence strings out of a reference/query line pair.

    ``cleaned_lines`` is expected to contain two consecutive lines, each made
    of a run of digits, some whitespace and then the sequence text. Only the
    first such pair found in the string is used.

    Parameters
    ----------
    cleaned_lines : str
        Text holding the pair of lines.

    Returns
    -------
    tuple of (str, str)
        The text after the leading number on the first and second line.

    Raises
    ------
    IndexError
        If no such pair of lines is present.
    """
    # Raw string: `\s` is a regex escape, not a Python string escape.
    pair_match = re.search(r'[0-9]+\s+(.*)\n[0-9]+\s+(.*)', cleaned_lines)
    if pair_match is None:
        raise IndexError('no reference/query line pair found in: %r' % (cleaned_lines,))
    return pair_match.groups()

def clean_alignment_string(alignment_string):
    """Stitch the reference and query sequences of one alignment back together.

    The text of a single alignment is scanned for blocks that sit between an
    "overline" (a line made only of whitespace, digits and ``|``) and an
    "underline" (a line made only of whitespace and ``^``). The reference and
    query lines of every block are extracted with
    ``extract_ref_and_query_seq`` and the pieces are concatenated in order.

    Parameters
    ----------
    alignment_string : str
        The text of one alignment from the ``show-aligns`` output.

    Returns
    -------
    tuple of (str, str)
        Upper-cased ``(ref_sequence, query_sequence)``.

    Raises
    ------
    ValueError
        If no overline/underline-delimited block is found.
    """
    # The line consisting of spaces, |'s, and numbers above each reference-query
    # sequence pair in show-aligns output.
    seqs_overline_regex = r'[\n]?[\s0-9\|]+\n'
    # The line beneath each reference-query sequence pair in show-aligns output,
    # pointing to mismatches with ^'s.
    seqs_underline_regex = r'\n[\s\^]+[\n]?'
    # Original author's note: the `?` quantifiers above are essential; without
    # them the matching drops the first and last lines.

    # (?s) lets '.' span newlines so one block can cover both sequence lines.
    block_regex = '(?s)' + seqs_overline_regex + '(.*?)' + seqs_underline_regex
    sequence_blocks = re.findall(block_regex, alignment_string)
    if not sequence_blocks:
        raise ValueError('no reference/query sequence blocks found in alignment string')

    # One (ref, query) pair per block, regrouped into all-ref / all-query pieces.
    ref_parts, query_parts = zip(*(extract_ref_and_query_seq(block) for block in sequence_blocks))
    ref_sequence = ''.join(ref_parts).upper()
    query_sequence = ''.join(query_parts).upper()

    return ref_sequence, query_sequence


In [134]:
def get_alignments_from_ids(ref_id=None, query_id=None):
    """Run ``show-aligns`` on ``promer.delta`` and tabulate the alignment headers.

    Parameters
    ----------
    ref_id : str, optional
        Reference sequence id passed to ``show-aligns``. Defaults to
        ``id_list[0]``.
    query_id : str, optional
        Query sequence id passed to ``show-aligns``. Defaults to
        ``id_list[1]``.

    Returns
    -------
    pandas.DataFrame
        One row per ``BEGIN alignment`` / ``END alignment`` block, with the
        columns ``ref_direction``, ``ref_start``, ``ref_end``,
        ``query_direction``, ``query_start`` and ``query_end``. All values are
        the strings captured from the header line. The frame is empty, but
        still has these columns, when no block is found.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status.
    """
    # Fall back to the notebook-level pair of ids when none are given.
    if ref_id is None:
        ref_id = id_list[0]
    if query_id is None:
        query_id = id_list[1]

    command = ('wsl cd ~; cd mummer4/mummer-4.0.0rc1; show-aligns promer.delta '
               + ref_id + ' ' + query_id)
    # check=True: fail loudly here rather than silently parsing empty output.
    completed = subprocess.run(command, shell=True, capture_output=True, check=True)
    alignments = completed.stdout.decode('utf-8')
    # Original author's note: `show-aligns` does no sorting by default, so the
    # matches are assumed to appear in the same order as in the delta file.

    # "Beginning delimiter" of every alignment in the `show-aligns` output.
    # NOTE(review): both patterns only accept a literal `1` after each sign. If
    # show-aligns prints other frame numbers for promer alignments (e.g. +2, -3),
    # those blocks are skipped - confirm against the tool's actual output.
    begin_alignment_regex = (
        r'-- BEGIN alignment \[ (?P<ref_direction>[+\-])1 (?P<ref_start>[0-9]+) - (?P<ref_end>[0-9]+) \|'
        r' (?P<query_direction>[+\-])1 (?P<query_start>[0-9]+) - (?P<query_end>[0-9]+) \]\n\n'
    )
    # "End delimiter" of every alignment in the `show-aligns` output.
    end_alignment_regex = r'\n\n--\s+END alignment \[ [+\-]1 [0-9]+ - [0-9]+ \| [+\-]1 [0-9]+ - [0-9]+ \]'

    # Capture everything between a begin delimiter and the next end delimiter.
    # (?s) makes '.' match newlines as well; see
    # https://stackoverflow.com/questions/42302482/python-find-a-string-between-two-strings-repeatedly#comment116031644_42302556
    parse_regex = '(?s)' + begin_alignment_regex + '(?P<alignment_string>.*?)' + end_alignment_regex

    header_columns = ['ref_direction', 'ref_start', 'ref_end',
                      'query_direction', 'query_start', 'query_end']
    # Only the header fields are kept. The alignment body is matched, so that
    # each block is consumed as a unit, but it is not returned.
    # TODO: attach ref/query sequences via clean_alignment_string once that is
    # confirmed to handle this output.
    header_rows = [
        {column: match.group(column) for column in header_columns}
        for match in re.finditer(parse_regex, alignments)
    ]

    # Passing `columns` keeps the frame well-formed even when nothing matched.
    return pd.DataFrame(header_rows, columns=header_columns)

In [135]:
# Build the alignment-header table and leave it as the cell's last expression so
# the notebook renders it.
alignment_table = get_alignments_from_ids()
alignment_table

Unnamed: 0,ref_direction,ref_start,ref_end,query_direction,query_start,query_end
0,+,3265,4989,+,3242887,3244611
1,+,10882,11187,+,3255322,3255639
2,-,4400219,4393398,-,3224284,3217454
3,+,12313,13026,+,3258442,3259155
4,+,23770,25464,+,3269860,3271620
...,...,...,...,...,...,...
1020,-,23774,14877,-,3269860,3260888
1021,+,4388917,4391250,+,3211987,3214359
1022,-,9818,7686,-,3251716,3249587
1023,-,7268,5175,-,3246904,3244811
