# Reading the Data

In [1]:
from Bio import SeqIO
import numpy as np
import csv
from shared import *
from project_nader import *
from time import time

In [2]:
def read_reads():
    """
    Reads all the RNA reads from reads.fa and returns them as a 2-column string array.

    The number of rows follows the number of records found in the file (it is not
    hard-coded), and the string width is inferred from the data, so long ids or
    sequences are not silently truncated.

    :return: a numpy unicode array of shape (number_of_reads, 2); column 0 holds the
             record id and column 1 holds the read sequence
    """
    records = [(seq_record.id, str(seq_record.seq))
               for seq_record in SeqIO.parse("reads.fa", "fasta")]
    # reshape keeps the result two-dimensional even when the file has no records
    return np.array(records, dtype=str).reshape(-1, 2)

In [3]:
def read_genome():
    """
    Loads the genome from genome.fa.

    Every FASTA record in the file is visited; the sequence of the last record is the
    one that is returned.

    :return: the genome sequence as a string, or None when the file holds no records
    """
    last_record = None
    for last_record in SeqIO.parse("genome.fa", "fasta"):
        # nothing to do per record: only the final one is kept
        pass
    if last_record is None:
        return None
    return str(last_record.seq)

In [4]:
def read_known_genes():
    """
    Parses genes.tab and builds the Gene / Isoform / Exon objects it describes.

    Each row starts with a record kind. 'gene' / 'isoform' rows (and their 'unknown_'
    variants) carry an id and a ';'-separated list of child ids; 'exon' /
    'unknown_exon' rows carry an id and two integer fields that are passed to Exon.
    Rows of any other kind are ignored.

    :return: (known_genes, unknown_genes), two dicts mapping a gene id to its Gene object
    """
    # record kind -> {id: list of child ids}
    child_ids = {'gene': {}, 'isoform': {}, 'unknown_gene': {}, 'unknown_isoform': {}}
    # record kind -> {id: Exon}
    exon_tables = {'exon': {}, 'unknown_exon': {}}

    with open("genes.tab") as tsv:
        for row in csv.reader(tsv, dialect="excel-tab"):
            kind = row[0]
            if kind in child_ids:
                child_ids[kind][row[1]] = row[2].split(';')
            elif kind in exon_tables:
                exon_tables[kind][row[1]] = Exon(row[1], int(row[2]), int(row[3]))

    def make_isoforms(isoform_ids, exons):
        # Replace each exon-id list with an Isoform built from the matching Exon objects
        return {iso_id: Isoform(iso_id, [exons[exon_id] for exon_id in exon_ids])
                for iso_id, exon_ids in isoform_ids.items()}

    def make_genes(gene_ids, isoforms):
        # Replace each isoform-id list with a Gene built from the matching Isoform objects
        return {gene_id: Gene(gene_id, [isoforms[iso_id] for iso_id in iso_ids])
                for gene_id, iso_ids in gene_ids.items()}

    # Isoforms first (known, then unknown), genes afterwards (known, then unknown)
    known_isoforms = make_isoforms(child_ids['isoform'], exon_tables['exon'])
    unknown_isoforms = make_isoforms(child_ids['unknown_isoform'], exon_tables['unknown_exon'])
    known_genes = make_genes(child_ids['gene'], known_isoforms)
    unknown_genes = make_genes(child_ids['unknown_gene'], unknown_isoforms)
    return known_genes, unknown_genes

# Alignment Algorithm Class

In [5]:
# Bounds on the intron size; not referenced by the code in this cell.
# NOTE(review): presumably measured in bases and meant for the spliced alignment — confirm.
MIN_INTRON_SIZE = 20
MAX_INTRON_SIZE = 10000


class Aligner:
    def __init__(self, genome_sequence, known_genes):
        """
        Initializes the aligner. Do all time intensive set up here. i.e. build suffix array.

        genome_sequence: a string (NOT TERMINATED BY '$') representing the bases of the genome
        known_genes: the known genes; this object is stored as-is on self.known_genes. You can get the
                     isoforms and exons from a Gene object (see shared.py)

        Time limit: 500 seconds maximum on the provided data. Note that our server is probably faster than your machine,
                    so don't stress if you are close. Server is 1.25 times faster than the i7 CPU on my computer
        """
        # NOTE(review): the notebook-level initialization cell builds the suffix array on
        # genome_sequence + '$', while this call passes the unterminated genome — confirm
        # which form get_suffix_array / get_bwt expect.
        self.sa = get_suffix_array(genome_sequence)
        L = get_bwt(genome_sequence, self.sa)
        self.occ, self.M = get_occ(L), get_M(get_F(L))
        self.known_genes = known_genes

    def align(self, read_sequence):
        """
        Returns an alignment to the genome sequence. An alignment is a list of pieces.
        Each piece consists of a start index in the read, a start index in the genome, and a length
        indicating how many bases are aligned in this piece. Note that mismatches are counted as "aligned".

        Note that <read_start_2> >= <read_start_1> + <length_1>. If your algorithm produces an alignment that
        violates this, we will remove pieces from your alignment arbitrarily until consecutive pieces
        satisfy <read_start_2> >= <read_start_1> + <length_1>

        Return value must be in the form (also see the project pdf):
        [(<read_start_1>, <reference_start_1>, <length_1>), (<read_start_2>, <reference_start_2>, <length_2>), ...]

        If no good matches are found: return the best match you can find or return []

        Time limit: 0.5 seconds per read on average on the provided data.

        The alignment itself is not implemented yet, so this method currently always
        reports "no match" by returning an empty list (instead of falling through and
        returning None, which would violate the documented return form).
        """
        # TODO: align against the known genes first
        # for gene in self.known_genes:

        return []

# Testing

In [6]:
# Main
# Load the three inputs once; the cells below reuse these names.
reads = read_reads()  # 2-column array: column 0 is the read id, column 1 the sequence
genome_sequence = read_genome()  # genome as a plain string
known_genes, unknown_genes = read_known_genes()  # dicts: gene id -> Gene object

In [7]:
# The printed value is the genome length plus one.
# NOTE(review): presumably the extra 1 accounts for the '$' terminator appended below — confirm.
print("length of the genome sequence: " + str(len(genome_sequence) + 1))

# t holds the negated start time, so t + time() further down is the elapsed time.
t = -time()
# The string literal below is a disabled alternative that would time Aligner.__init__
# itself; as a bare string expression it has no effect when the cell runs.
"""
aligner = Aligner(genome_sequence, known_genes)
t += time()
"""
# Initialization
# Same steps as Aligner.__init__, run at notebook scope; sa, L, occ and M become
# notebook-level names (a later cell passes M and occ to exact_suffix_matches).
# NOTE(review): the suffix array is built on genome_sequence + '$' here, whereas
# Aligner.__init__ and the get_bwt call below use the unterminated genome — confirm
# which form get_suffix_array / get_bwt expect.
sa = get_suffix_array(genome_sequence + '$')
L = get_bwt(genome_sequence, sa)
occ, M = get_occ(L), get_M(get_F(L))
# self.known_genes = known_genes

# The label mentions Aligner.__init__, but the time measured is that of the inline steps above.
print("time to run Aligner.__init__: " + str(t + time()))

length of the genome sequence: 10949001
time to run Aligner.__init__: 854.515557050705


In [8]:
t = -time()
# known_transcriptome[gene id][Isoform] = [spliced sequence, exon table], where each
# exon table entry is (offset of the exon in the spliced sequence, ex.start, exon length)
known_transcriptome = {}
for gene_id, gene_obj in known_genes.items():
    known_transcriptome[gene_id] = isoform_table = {}
    for iso in gene_obj.isoforms:
        pieces, exon_table, offset = [], [], 0
        for ex in iso.exons:
            exon_len = ex.end - ex.start
            pieces.append(genome_sequence[ex.start: ex.end])
            exon_table.append((offset, ex.start, exon_len))
            offset += exon_len
        # glue the exon slices together in one pass
        isoform_table[iso] = [''.join(pieces), exon_table]
print("time to find the known transcriptome: " + str(t + time()))

time to find the known transcriptome: 2.007504940032959


In [9]:
t = -time()
# unknown_transcriptome[gene id][Isoform] = [spliced sequence, exon table], where each
# exon table entry is (offset of the exon in the spliced sequence, ex.start, exon length)
unknown_transcriptome = {}
for gene_id, gene_obj in unknown_genes.items():
    unknown_transcriptome[gene_id] = isoform_table = {}
    for iso in gene_obj.isoforms:
        pieces, exon_table, offset = [], [], 0
        for ex in iso.exons:
            exon_len = ex.end - ex.start
            pieces.append(genome_sequence[ex.start: ex.end])
            exon_table.append((offset, ex.start, exon_len))
            offset += exon_len
        # glue the exon slices together in one pass
        isoform_table[iso] = [''.join(pieces), exon_table]
print("time to find the UNknown transcriptome: " + str(t + time()))

time to find the UNknown transcriptome: 0.0005719661712646484


In [10]:
# Total number of isoforms over all known genes
num_isoforms = sum(len(isoform_table) for isoform_table in known_transcriptome.values())
print('number of known isoforms:', num_isoforms)

number of known isoforms: 127


In [11]:
# Total number of isoforms over all unknown genes
num_isoforms = sum(len(isoform_table) for isoform_table in unknown_transcriptome.values())
print('number of UNknown isoforms:', num_isoforms)

number of UNknown isoforms: 9


In [12]:
def find_alignment(read_len, align_start, exons):
    """
    Converts a match inside a spliced isoform into genome alignment pieces.

    :param read_len: number of read bases that were matched
    :param align_start: offset in the spliced isoform sequence where the match starts
    :param exons: list of (offset_in_isoform, genome_start, length) entries, ordered by
                  offset_in_isoform, one per exon of the isoform
    :return: list of (read_start, genome_start, length) pieces, one per exon the read
             overlaps; read_start grows by the length of each previous piece, so the
             pieces are contiguous in the read. An empty list is returned when
             read_len is 0.
    :raises IndexError: if align_start lies outside the isoform, or if the read
                        extends past the last exon
    """
    # Binary search for the exon whose interval in the isoform contains align_start.
    lo, hi = 0, len(exons) - 1
    idx = None
    while lo <= hi:
        mid = (lo + hi) // 2
        exon_offset, exon_len = exons[mid][0], exons[mid][2]
        if align_start < exon_offset:
            hi = mid - 1
        elif align_start >= exon_offset + exon_len:
            lo = mid + 1
        else:
            idx = mid
            break
    if idx is None:
        raise IndexError("align_start {:d} is outside the isoform".format(align_start))

    algn = []
    read_pos = 0
    # Only the first exon is entered part-way; every following exon is used from its start.
    skip = align_start - exons[idx][0]
    while read_pos < read_len:
        if idx >= len(exons):
            raise IndexError("read extends past the last exon of the isoform")
        genome_start, exon_len = exons[idx][1], exons[idx][2]
        piece_len = min(read_len - read_pos, exon_len - skip)
        if piece_len > 0:  # zero-length exons contribute no piece
            algn.append((read_pos, genome_start + skip, piece_len))
            read_pos += piece_len
        skip = 0
        idx += 1
    return algn

In [13]:
def align_to_isoform(read, isoform, exons):
    match = (-1, float("inf"))
    for i in range(len(isoform) - len(read) + 1):
        j, mismatches = 0, 0
        while j < len(read):
            if isoform[i+j] != read[j]:
                mismatches += 1
            if mismatches > 6:
                break
            j += 1
        if j == len(read) and mismatches < match[1]:
            match = (i, mismatches)
    return match if match[0] == -1 else (find_alignment(len(read), match[0], exons), match[1])

In [None]:
def align_to_transcriptome(read, transcriptome):
    """
    Aligns the read against every isoform of every gene and keeps the best result.

    :param read: the read sequence
    :param transcriptome: mapping gene -> {isoform: [spliced sequence, exon table]}
    :return: (-1, inf) when no isoform yields an alignment, otherwise
             (alignment, mismatches) of the isoform with the fewest mismatches;
             on a tie the isoform visited last wins
    """
    best = (-1, float("inf"))
    for isoform_table in transcriptome.values():
        for entry in isoform_table.values():
            alignment, mismatches = align_to_isoform(read, entry[0], entry[1])
            if alignment == -1 or mismatches > best[1]:
                continue
            best = (alignment, mismatches)
    return best

In [14]:
# Show the sequence (column 1) of the first read.
reads[0][1]

'ATTACTCTTGGGAATGAAATCCTATCTATATAAGCTGTGGTTTGAAATCC'

In [None]:
# NOTE(review): rand_iso is not assigned anywhere in the visible notebook; unless it is
# defined in a cell that is not shown, evaluating this raises NameError.
rand_iso

In [None]:
# Spot check: align a single read (index 150) against the known transcriptome.
# The timer is started here, but the elapsed time is never read in this cell.
t = -time()
# align_to_isoform(reads[166][1], reads[166][1])
align_to_transcriptome(reads[150][1], known_transcriptome)

In [None]:
# Indices of the reads for which no known isoform produced an alignment
unable_to_match = []
for i, record in enumerate(reads):
    start, mismatches = align_to_transcriptome(record[1], known_transcriptome)
    if start != -1:
        continue
    unable_to_match.append(i)
    print(str(i) + ' not matched!')

In [None]:
# Second pass: retry the reads that failed against the known transcriptome, this time
# against the unknown transcriptome, and keep the indices of those that still fail.
still_unable_to_match = []
# Iterate over the stored read indices themselves; using positions within
# unable_to_match would re-test reads 0..k-1 instead of the reads that failed.
for i in unable_to_match:
    start, mismatches = align_to_transcriptome(reads[i][1], unknown_transcriptome)
    if start != -1:
        print(str(i) + ' matched to an unknown gene!')
    else:
        still_unable_to_match.append(i)

In [22]:
# Time one exact_suffix_matches query: the first read with its last 38 characters
# removed, searched using the M and occ tables built in the initialization cell.
t = -time()
print('         ' + str(exact_suffix_matches(reads[0][1][:-38], M, occ)))
print("time to find the read in the genome: " + str(t + time()))

         ((3152211, 3152212), 12)
time to find the read in the genome: 0.0007557868957519531


In [None]:
"""
Use exact_suffix_matches but need to find consecutive matches with possible mismatches and restricted intron length
"""
def align_to_genome(read):
    """
    Placeholder for aligning a read directly against the genome.

    Not implemented yet: the body is empty, so every call returns None.

    :param read: the read to align (unused until the function is implemented)
    """
    # TODO
    pass