In [15]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import subprocess
import os
import re
import sys
import pandas as pd

In [16]:
## The following functions are copied wholesale from CheckV as I find them to be fast and correct
## link https://bitbucket.org/berkeleylab/checkv/src/master/checkv/modules/complete_genomes.py

def fetch_dtr(fullseq, min_length=20):
    startseq = fullseq[0:min_length]
    # find index positions of all matches of startseq in fullseq
    # only keep matches occuring in 2nd half of string
    matches = [
        m.start() for m in re.finditer("(?={0})".format(re.escape(startseq)), fullseq)
    ]
    matches = [_ for _ in matches if _ >= len(fullseq) / 2]
    for matchpos in matches:
        # determine if the match extends to the contig end
        endseq = fullseq[matchpos:]
        if fullseq[0 : len(endseq)] == endseq:
            return endseq
    return ""

def reverse_complement(seq):
    if sys.version_info > (3, 0):
        trans = str.maketrans("ACTG", "TGAC")
    else:
        trans = string.maketrans("ACTG", "TGAC")
    return seq[::-1].translate(trans)

def fetch_itr(seq, min_len=20, max_len=1000):
    rev = reverse_complement(seq)
    # see if minimal substring occurs at end
    if seq[:min_len] == rev[:min_len]:
        # extend to maximum substring, up to <max_len>
        i = min_len + 1
        while seq[:i] == rev[:i] and i <= max_len:
            i += 1
        return seq[: i - 1]
    # no match
    else:
        return ""

In [23]:
fasta_file = "/Users/u241374/mike_tisza/sandbox/test123_SRS893334.fna"

output_file = "/Users/u241374/mike_tisza/sandbox/trimmed_circles1.fna"

if os.path.isfile(output_file):
    os.remove(output_file)

terminal_r_list = []
for seq_record in SeqIO.parse(fasta_file, "fasta"):
    if not len(seq_record.seq) >= 1000:
        continue
    dtr_seq = fetch_dtr(str(seq_record.seq))

    if not dtr_seq:
        dtr_seq = "NA"

    itr_seq = fetch_itr(str(seq_record.seq))

    if not itr_seq:
        itr_seq = "NA"

    if not dtr_seq == "NA":
        print(f">{seq_record.id}", file = open(output_file, "a"))
        print(seq_record.seq[:-len(dtr_seq)], file = open(output_file, "a"))

        terminal_r_list.append([seq_record.id, len(seq_record.seq), len(seq_record.seq[:-len(dtr_seq)]), dtr_seq, itr_seq])
    else:
        print(f">{seq_record.id}", file = open(output_file, "a"))
        print(seq_record.seq, file = open(output_file, "a"))

        terminal_r_list.append([seq_record.id, len(seq_record.seq), len(seq_record.seq[:-len(dtr_seq)]), dtr_seq, itr_seq])

terminal_df = pd.DataFrame(terminal_r_list, columns=["contig", "in_length_contig", "out_length_contig", "dtr_seq", "itr_seq"])

print(terminal_df)

                     contig  in_length_contig  out_length_contig dtr_seq  \
0                         1              6486               6484      NA   
1                         2              4110               4108      NA   
2                         3              4425               4423      NA   
3                         4              3390               3388      NA   
4                         5              3562               3560      NA   
...                     ...               ...                ...     ...   
6470  contig-100_1004.51897              1002               1000      NA   
6471  contig-100_1005.51898              1002               1000      NA   
6472  contig-100_1006.51899              1002               1000      NA   
6473  contig-100_1007.51900              1001                999      NA   
6474  contig-100_1008.51901              1000                998      NA   

     itr_seq  
0         NA  
1         NA  
2         NA  
3         NA  
4         NA