In [4]:
import sys
import bisect
from collections import defaultdict

from io import StringIO
from math import ceil, log2

import exact_string_matching_algorithms as alg

In [5]:
# filter_real_alignments
# Table-driven check: every scenario supplies the candidate offsets, the text
# to search, and the offsets that must survive filtering.
pattern = "ATC"
first_candidates = [0, 8, 9, 10]  # same list object reused by the first two scenarios
scenarios = (
    (first_candidates, "ACGTGCTGAAAT", []),       # pattern absent from the text
    (first_candidates, "ATCTGCTGAAAT", [0]),      # only offset 0 is a real match
    ([0, 3, 6, 9], "ATCATCACTATC", [0, 3, 9]),    # offset 6 ("ACT") is rejected
)
for possible_alignments, text, expected_alignments in scenarios:
    real_alignments = alg.filter_real_alignments(pattern, possible_alignments, text)
    assert real_alignments == expected_alignments

In [6]:
# parse_fasta
# Each case pairs raw FASTA text with the records the parser must produce.
# An empty expectation means the whole result must compare equal to {};
# otherwise every listed header is looked up and its sequence compared.
# (The original cell repeated the single-record case twice; it is kept once.)
fasta_cases = [
    ("", {}),                                     # empty input
    ("test_sequence", {}),                        # text without a ">" header line
    (">test_sequence", {"test_sequence": ""}),    # header with no sequence lines
    (">test_sequence\nACTG", {"test_sequence": "ACTG"}),
    (
        ">test_sequence1\nACTG\n>test_sequence2\nCTGA",
        {"test_sequence1": "ACTG", "test_sequence2": "CTGA"},
    ),
]
for fasta_text, expected_records in fasta_cases:
    string_io = StringIO(fasta_text)
    parsed_fasta = alg.parse_fasta(string_io)
    if not expected_records:
        assert parsed_fasta == {}, (
            f"expected no records for {fasta_text!r}, got {parsed_fasta!r}"
        )
    for header, sequence in expected_records.items():
        # The message identifies the failing case, which a bare assert
        # inside a loop would not.
        assert parsed_fasta[header] == sequence, (
            f"record {header!r} parsed from {fasta_text!r}: "
            f"expected {sequence!r}, got {parsed_fasta[header]!r}"
        )

In [7]:
# choose_algorithm
# Redundant check (per the original note, the function already validates the
# value itself); the returned menu number must lie in the range 1..4.
choice = alg.choose_algorithm()
assert 1 <= choice <= 4

Exact String Matching Algorithms

Choose algorithm:
1. Sorted Index
2. Hash Table
3. Suffix Array
4. Suffix Tree
4

You chose the Suffix Tree algorithm.


In [8]:
# init_structure
# For each value of alg.choice, verify the exact type of the structure built.
expected_types = (
    (1, alg.IndexSorted),
    (2, alg.IndexHash),
    (3, alg.SuffixArray),
    (4, alg.SuffixTree),
    (5, type(None)),  # a choice outside 1..4 yields None
)
for alg.choice, expected_type in expected_types:
    struct = alg.init_structure('...', 'SOME TEXT', 5)
    assert type(struct) is expected_type

In [9]:
# create_indexes
# Run for every algorithm choice: the number of indexes built must equal the
# number of FASTA records parsed from the input.
index_count_cases = (
    ("", 0),
    (">test_sequence", 1),
    (">test_sequence1\nACTG\n>test_sequence2\nCTGA", 2),
)

for alg.choice in range(1, 5):
    for fasta_text, expected_count in index_count_cases:
        string_io = StringIO(fasta_text)
        alg.parsed_fasta = alg.parse_fasta(string_io)
        alg.create_indexes()
        assert len(alg.indexes) == expected_count

Creating index for sequence test_sequence
Creating index for sequence test_sequence2
Creating index for sequence test_sequence1
Creating index for sequence test_sequence
Creating index for sequence test_sequence2
Creating index for sequence test_sequence1
Creating index for sequence test_sequence
Creating index for sequence test_sequence2
Creating index for sequence test_sequence1
Creating index for sequence test_sequence
Creating index for sequence test_sequence2
Creating index for sequence test_sequence1


In [11]:
# query
# Run for every algorithm choice; expected hit counts depend on the structure.
# SortedIndex / HashTable: query doesn't work for strings shorter than five
# characters, and "GTAACTG" is reported twice (two potential alignments,
# only one of which is real).
index_based_hits = (("A", 0), ("GTAACTG", 2))
# SuffixArray / SuffixTree: exact occurrence counts.
suffix_based_hits = (("A", 6), ("GTAACTG", 1))
# Checked for every structure, after the structure-specific queries.
common_hits = (("ACGTA", 1), ("ACGTAACTGTAACGGTAC", 1))
hits_by_choice = {
    1: index_based_hits,
    2: index_based_hits,
    3: suffix_based_hits,
    4: suffix_based_hits,
}
for alg.choice in range(1, 5):
    string_io = StringIO(">test_sequence\nACGTAACTGTAACGGTAC")
    alg.parsed_fasta = alg.parse_fasta(string_io)
    alg.create_indexes()
    for query_text, expected_hits in hits_by_choice[alg.choice] + common_hits:
        assert len(alg.indexes[0].query(query_text)) == expected_hits

Creating index for sequence test_sequence
Creating index for sequence test_sequence
Creating index for sequence test_sequence
Creating index for sequence test_sequence


In [None]:
# The following functions just wrap calls to other functions and have no
# tests of their own in this notebook:
# main
# file_processing
# per_pattern_processing