In [7]:
from project_nader import *
from project_tiffany import get_suffix_array as gsa

In [2]:
from Bio import SeqIO
import numpy as np
import csv
from shared import *
from time import time
import numpy as np

In [3]:
def read_reads():
    """
    Reads all the RNA reads from reads.fa and returns them as a 2-D array of strings.

    The array is sized from the file itself, so it holds exactly one row per FASTA record
    (no fixed row count, no blank padding rows).

    :return: a numpy array of shape (number_of_reads, 2) with a fixed-width unicode dtype of
             600 characters; column 0 is the record id and column 1 the read sequence
    """
    max_id_len = 600
    records = [(seq_record.id, str(seq_record.seq))
               for seq_record in SeqIO.parse("reads.fa", "fasta")]
    # The fixed-width dtype is kept from the earlier version of this function.
    # reshape(-1, 2) preserves the two-column layout even when the file holds no records.
    return np.array(records, dtype='U{:d}'.format(max_id_len)).reshape(-1, 2)

In [4]:
def read_genome():
    """
    Load the genome from genome.fa.

    Every FASTA record in the file is visited; the sequence of the last one is returned.

    :return: the genome sequence as a string, or None when genome.fa holds no records
    """
    last_record = None
    for last_record in SeqIO.parse("genome.fa", "fasta"):
        # Nothing to do per record: only the final one matters.
        pass
    if last_record is None:
        return None
    return str(last_record.seq)

In [5]:
def read_known_genes():
    """
    Reads the un/known genes, isoforms, and exons from genes.tab and constructs objects for each.

    Each row is tab-separated. Column 0 is the record type ('gene', 'isoform', 'exon', or the
    same with an 'unknown_' prefix) and column 1 is the record id. Gene and isoform rows carry
    a ';'-separated list of member ids in column 2; exon rows carry two integer fields in
    columns 2 and 3. Blank rows and rows with any other record type are skipped.

    Known isoforms are resolved against known exons only, and unknown isoforms against unknown
    exons only (likewise for genes and isoforms); a missing id raises KeyError.

    :return: known_genes, unknown_genes -- two dicts mapping gene id to Gene object
    """
    known = {'gene': {}, 'isoform': {}, 'exon': {}}
    unknown = {'gene': {}, 'isoform': {}, 'exon': {}}
    unknown_prefix = 'unknown_'

    # newline='' lets the csv module do its own line-ending handling.
    with open("genes.tab", newline='') as tsv:
        for row in csv.reader(tsv, dialect="excel-tab"):
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise IndexError.
                continue
            record_type = row[0]
            if record_type.startswith(unknown_prefix):
                tables, kind = unknown, record_type[len(unknown_prefix):]
            else:
                tables, kind = known, record_type
            if kind == 'exon':
                tables['exon'][row[1]] = Exon(row[1], int(row[2]), int(row[3]))
            elif kind in ('gene', 'isoform'):
                # Store the member ids for now; they are swapped for objects below.
                tables[kind][row[1]] = row[2].split(';')

    # Build all Isoform objects (known, then unknown) before any Gene needs them.
    for tables in (known, unknown):
        tables['isoform'] = {
            isoform_id: Isoform(isoform_id, [tables['exon'][exon_id] for exon_id in exon_ids])
            for isoform_id, exon_ids in tables['isoform'].items()
        }

    # Build the Gene objects (known, then unknown) from the finished isoforms.
    for tables in (known, unknown):
        tables['gene'] = {
            gene_id: Gene(gene_id, [tables['isoform'][isoform_id] for isoform_id in isoform_ids])
            for gene_id, isoform_ids in tables['gene'].items()
        }

    return known['gene'], unknown['gene']

In [6]:
# Main
# Load the three inputs once; later cells read these notebook-level names
# (reads, genome_sequence, known_genes).
reads = read_reads()
genome_sequence = read_genome()
known_genes, unknown_genes = read_known_genes()

In [8]:
# Build the index structures that the matching cells below read as globals (sa, M, occ).
# NOTE(review): get_suffix_array, get_bwt, get_F, get_M, get_occ and TERMINATOR are not
# defined in this notebook; presumably they come from the star imports -- confirm.
genome = genome_sequence
genes = known_genes
# The indexed text is the genome reversed, with TERMINATOR appended.
s = genome_sequence[::-1] + TERMINATOR
sa = get_suffix_array(s)
L = get_bwt(s, sa)
F = get_F(L)
M = get_M(F)
occ = get_occ(L)

In [9]:
def replace_base_at_index(read_sequence, idx, base):
    """
    Return a copy of read_sequence with the character at position idx replaced by base.

    :param read_sequence: the read as a string
    :param idx: position to replace; negative values count from the end, as in normal indexing
    :param base: replacement text inserted in place of the character at idx
    :return: the modified read as a new string
    """
    length = len(read_sequence)
    # Normalise valid negative indices first: with idx == -1 the tail slice would otherwise
    # start at 0 and append the whole read after the new base.
    if -length <= idx < 0:
        idx += length
    return read_sequence[:idx] + base + read_sequence[idx + 1:]

In [None]:
def replace_bases_at_indices(read_sequence, _range, bases):
    """
    Return a copy of read_sequence with the span _range[0].._range[1] replaced by bases.

    :param read_sequence: the read as a string
    :param _range: indexable pair (first index, last index); both ends are inclusive
    :param bases: replacement text; it may differ in length from the span it replaces
    :return: the modified read as a new string
    """
    first, last = _range[0], _range[1]
    # Use the _range parameter here: subscripting the builtin `range` raises TypeError.
    return read_sequence[:first] + bases + read_sequence[last + 1:]

In [14]:
def check_seeds(_range, length, high_bound=None):
    """
    Turn a suffix-array interval into a list of accepted seed positions.

    For every suffix-array index in [_range[0], _range[1]) the end position
    sa[index] + length is computed. Without a (truthy) high_bound every position is
    accepted; otherwise the end position must lie below high_bound and the gap up to
    high_bound must fall within [MIN_INTRON_SIZE, MAX_INTRON_SIZE].

    :return: list of len(genome_sequence) + 1 - end_position for each accepted position
    """
    accepted = []
    for sa_index in range(_range[0], _range[1]):
        end_position = sa[sa_index] + length
        if high_bound:
            if not end_position < high_bound:
                continue
            if not MIN_INTRON_SIZE <= high_bound - end_position <= MAX_INTRON_SIZE:
                continue
        accepted.append(len(genome_sequence) + 1 - end_position)
    return accepted

In [11]:
def find_max_so_far(max_so_far, len_read, mismatches):
    """
    Try to lengthen a suffix match by substituting single bases, within a substitution budget.

    The base at index len_read - matched_length - 1 of the read is replaced by each other
    entry of BASES. Each variant is scored with exact_suffix_matches, and the variants that
    reach the longest new match are explored recursively, in BASES order.

    :param max_so_far: tuple (range, matched_length, read, substitutions_used)
    :param len_read: length of the read
    :param mismatches: maximum number of substitutions allowed
    :return: (True, state) when the state has used exactly `mismatches` substitutions or its
             matched length has reached len_read (state has the same layout as max_so_far);
             (False, ()) when the budget is exceeded or no substitution lengthens the match
    """
    matched_length, read, used = max_so_far[1], max_so_far[2], max_so_far[3]

    if used > mismatches:
        return False, ()
    if used == mismatches or matched_length >= len_read:
        # Budget spent or nothing left to match: report the state as it stands.
        return True, max_so_far

    # Every variant is derived from the incoming read at this one position, so each
    # candidate differs from it by exactly one more substitution.
    position = len_read - matched_length - 1
    current_base = read[position]
    candidates = []
    for base in BASES:
        if base == current_base:
            continue
        new_read = replace_base_at_index(read, position, base)
        new_range, new_length = exact_suffix_matches(new_read, M, occ)
        if new_length <= matched_length:
            continue
        candidate = (new_range, new_length, new_read, used + 1)
        if new_length >= len_read:
            return True, candidate
        candidates.append(candidate)

    if not candidates:
        # No substitution at this position lengthens the match: dead end.
        return False, ()

    best_length = max(candidate[1] for candidate in candidates)
    print('      ' + 'new')
    for candidate in candidates:
        if candidate[1] != best_length:
            continue
        print('      ' + str(candidate))
        found, result = find_max_so_far(candidate, len_read, mismatches)
        if found:
            return found, result
    # Every best-length candidate ended in a dead end; return a tuple, never None.
    return False, ()

In [12]:
def seed_finder(read_sequence, high_bound=None, mismatches=6, k=3):
    """
    Look for genome positions where read_sequence matches end to end, allowing substitutions.

    Only the single-seed mode (k == 1) is implemented: the read is matched with
    exact_suffix_matches, extended with find_max_so_far, and the resulting interval is
    filtered by check_seeds.

    :param read_sequence: the read to place, as a string
    :param high_bound: optional upper bound passed through to check_seeds
    :param mismatches: substitution budget passed through to find_max_so_far
    :param k: number of seeds; any value other than 1 is not implemented
    :return: (True, (seeds, matched_length)) when the whole read matches and check_seeds
             accepts at least one position; (False, ()) otherwise; None when k != 1
    """
    len_read = len(read_sequence)
    if k != 1:
        # TODO: multi-seed search (k > 1) is not implemented. The earlier draft only sketched
        # a loop that ran exact_suffix_matches on each suffix read_sequence[i:].
        # None is returned here, exactly as before.
        return None

    _range, length = exact_suffix_matches(read_sequence, M, occ)
    outcome = find_max_so_far((_range, length, read_sequence, 0), len_read, mismatches)
    if outcome is None:
        # Defensive: treat a missing result as "no match" instead of failing on unpacking.
        return False, ()

    found, best = outcome
    # Check the flag before indexing: on failure `best` is an empty tuple.
    if not found or best[1] != len_read:
        return False, ()

    seeds = check_seeds(best[0], best[1], high_bound)
    if seeds:
        return True, (seeds, best[1])
    return False, ()

In [15]:
# Try the single-seed search on one read. The read is reversed first, in line with the
# reversed genome text that the index was built from.
reverse_read = reads[91][1][::-1]
# print('        ' + str(len(reverse_read)))
seedds = seed_finder(reverse_read, k=1)
# NOTE(review): the label 253 is hard-coded and differs from the row index 91 used above --
# confirm which read this output is meant to identify.
print('        ' + str(253) + ": " + str(seedds))
# print('        ' + str(seedds[0] > seedds[1]))

      new
      ((1071416, 1071417), 42, 'CTATCGGCAATCTTCAAATAGTCAGAAAAGAAAGTGTGAAAGTTAAAGGA', 1)
      new
      ((8301542, 8301543), 49, 'CTATCGGGAATCTTCAAATAGTCAGAAAAGAAAGTGTGAAAGTTAAAGGA', 2)
        253: (True, ([2590478], 50))


In [16]:
# Second suffix array for the same text s, built with project_tiffany's get_suffix_array
# (imported as gsa); the next cell compares it entry by entry with sa.
sa_T = gsa(s)

In [52]:
# Collect the indices at which the two suffix arrays disagree.
# A single pass is enough: repeating the scan until a count is reached never terminates
# when the arrays are identical, and records the same index twice when only one differs.
# As before, the scan stops once three mismatches have been recorded.
misses = []
for i in range(len(sa)):
    if sa[i] != sa_T[i]:
        misses.append(i)
        if len(misses) > 2:
            break
num_misses = len(misses)

In [59]:
# Only the last expression of a cell is displayed, so `misses` on its own line shows nothing.
misses
# Suffix-array entry at index 282117.
# NOTE(review): 282117 is presumably the first index recorded in misses -- confirm.
sa[282117]

9041832

In [40]:
# First (up to) 200 characters of the suffixes that sa lists at indices 282117..282119.
a = [s[start:start + 200] for start in sa[282117:282120]]

In [41]:
# Same window as for `a`, but taken from the second suffix array sa_T.
b = [s[start:start + 200] for start in sa_T[282117:282120]]

In [50]:
# Characters 100..119 of the first suffix prefix taken from sa.
a[0][100:120]

'CACACAAAGGTTTGATGACG'

In [51]:
# Characters 100..119 of the first suffix prefix taken from sa_T, for comparison with a[0].
b[0][100:120]

'CTCACAAAGGTTTGACGACG'

In [39]:
# Report the first offset at which the two suffix prefixes a[0] and b[0] differ,
# followed by the two differing characters.
# zip stops at the shorter string, so a shorter b[0] cannot raise IndexError.
for i, (base_a, base_b) in enumerate(zip(a[0], b[0])):
    if base_a != base_b:
        print('        '+ str(i))
        print('        '+ str(base_a) + ' vs ' + str(base_b))
        break

        101
