In [22]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import subprocess
import os
import re
import sys

In [23]:
## The following functions are copied wholesale from CheckV as I find them to be fast and correct
## link https://bitbucket.org/berkeleylab/checkv/src/master/checkv/modules/complete_genomes.py

def fetch_dtr(fullseq, min_length=20):
    """Return the direct terminal repeat (DTR) of ``fullseq``, or "" if none.

    A DTR is a suffix of the sequence that is identical to the prefix of the
    same length. The first ``min_length`` characters act as a seed: every
    occurrence of the seed that starts in the second half of the sequence is
    tried, in left-to-right order, and the first one whose tail runs all the
    way to the end of the sequence while matching the start is returned.

    Args:
        fullseq: sequence as a plain string.
        min_length: length of the seed taken from the start of ``fullseq``.

    Returns:
        The repeated terminal substring, or the empty string when no
        candidate extends to the end of the sequence.
    """
    seed = fullseq[0:min_length]
    # Smallest integer index that is >= len(fullseq) / 2, i.e. the first
    # position that counts as lying in the second half of the sequence.
    search_from = (len(fullseq) + 1) // 2
    hit = fullseq.find(seed, search_from)
    while hit != -1:
        tail = fullseq[hit:]
        # The candidate is a DTR only if everything from the hit to the
        # end of the sequence also appears at the very start.
        if fullseq.startswith(tail):
            return tail
        # Step forward by one so overlapping seed occurrences are tried too.
        hit = fullseq.find(seed, hit + 1)
    return ""

def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence string.

    The sequence is reversed and each base is swapped for its complement
    (A<->T, C<->G). Upper- and lower-case bases are both complemented and
    keep their case, so soft-masked (lower-case) input is handled correctly.
    Any other character (e.g. ``N`` or IUPAC ambiguity codes) is left as is.

    Args:
        seq: DNA sequence as a plain string.

    Returns:
        The reverse-complemented sequence as a string of the same length.
    """
    # Python 3 only: the former Python 2 fallback called string.maketrans
    # without ever importing ``string`` and so could never have worked.
    complement = str.maketrans("ACTGactg", "TGACtgac")
    return seq[::-1].translate(complement)

def fetch_itr(seq, min_len=20, max_len=1000):
    """Return the inverted terminal repeat (ITR) of ``seq``, or "" if none.

    The sequence is compared with its own reverse complement. When their
    first ``min_len`` characters agree, the shared prefix is extended one
    character at a time for as long as the two still agree, and is capped
    at ``max_len`` characters.

    Args:
        seq: sequence as a plain string.
        min_len: number of leading characters that must match before an
            ITR is reported.
        max_len: upper bound applied while extending the match.

    Returns:
        The leading part of ``seq`` that matches the start of its reverse
        complement, or the empty string when the first ``min_len``
        characters differ.
    """
    rev = reverse_complement(seq)

    # Guard clause: the minimal seed does not match, so there is no ITR.
    if seq[:min_len] != rev[:min_len]:
        return ""

    # ``matched`` is the length of the prefix known to agree; grow it while
    # one more character still agrees and the cap has not been reached.
    matched = min_len
    while seq[: matched + 1] == rev[: matched + 1] and matched + 1 <= max_len:
        matched += 1
    return seq[:matched]

In [26]:
# NOTE(review): absolute, machine-specific input path; point it at a local
# FASTA file before re-running this cell elsewhere.
fasta_file = "/Users/u241374/mike_tisza/sandbox/test123_SRS893334.fna"


# Scan every record and print those carrying at least one terminal repeat.
# Output columns: record id, DTR sequence, ITR sequence ("NA" = not found).
for seq_record in SeqIO.parse(fasta_file, "fasta"):
    # Both helpers return "" when nothing is found; show that as "NA".
    dtr_seq = fetch_dtr(str(seq_record.seq)) or "NA"
    itr_seq = fetch_itr(str(seq_record.seq)) or "NA"

    if dtr_seq != "NA" or itr_seq != "NA":
        print(seq_record.id, dtr_seq, itr_seq)

565 AACTCAGTGAGTTGAACCCACACATCACAAAGTAGTTTCTGAGAATCATTCTGTCTAGTTTTTCTATGAAGATATTGCCTTTTCCACCATAGGCCTCAA NA
918 GATTATTTGTTGTCGTACATTTATTGAGCCCCTTTTTAGGGGCGGCACTTCCGAGACGTAAGGCAAATGCTCGCGCTGAAAGTGCCGCTCCTGAAGGAGCTCGTATAACTTTTCGTTTTGTGTGTACCCAGGGGTTCATATTGGGCTTTGCCCTTATTCCACCCTGGGCTATGCAGGAGTGTCGCCCTAAAAGGGCTCATCTCGGCAAGTGCTAAGCGTGT NA
1287 NA GTGATTAGCATTGGAATCATTCTACTTGCTAGCCTCCATATCATTACAAGACCAAGTCAAGCAAGTGAAACCGACTCCTTCTATCA
1344 CCTTTCCAGATTGTTTTCTAATTGATAGAAATGGGATTGTGAAAAGAATTATGAAAGATTAGTGCTTTAATTAAATAGTGAATCATATTAATCAAAATAGTTTGATTTTTATAATTGGTAGCCTGCCTATGAA NA
1364 TACTGTAACGTCACCTGTCTTAGGATCTTGCTCAACATCTACTGTTGGTGTCTTTTGAGCTGGTGTTGTCACATCCACT NA
2743 CCGAAGAAGAAGCTAAGATTACAGCAGCTGACACAGCAGAAGCAGTAGCAACAGCGAAAACAGCAGGTGTAACAGCAATCGAAGGCGTGCACACACCAGGTAACTTAGATACTGTAAAAGAAGCAGCCAAAGCAGACTTAGCCAAAGCAGCGCAAGCAGAAAAAGAAGCAATCGCAGCAGATAAGAGCTTAACAGCAGCCCAACGTACAG NA
3260 TCCTGTCTCACTCTTGAGGGACATCCTCTCCTCCGCTCACAGGTGGACAGACTCCCTGGATCTTTTGGCTGTAACGAATGTCAGGAAACAAAGGGACTG NA
3799 GTGAGCC