# Reading the Data

In [14]:
from Bio import SeqIO
import numpy as np
import csv
from shared import *
from project_nader import *
from time import time

In [2]:
def read_reads():
    """
    Reads all the RNA reads from reads.fa.

    The result is a fixed-width byte-string array, so ids and sequences come back as bytes
    (e.g. b'ACGT'), not str.

    :return: a numpy array of shape (number_of_reads, 2) with a byte-string dtype; column 0 holds the
             read id and column 1 holds the read sequence
    """
    min_field_len = 60
    records = [(seq_record.id, str(seq_record.seq)) for seq_record in SeqIO.parse("reads.fa", "fasta")]

    # Size the fields from the data so long ids/sequences are not silently truncated.
    # The width never drops below 60, the width that used to be hard-coded.
    field_len = max([min_field_len] + [len(field) for record in records for field in record])

    # One row per read actually present in the file (the row count is no longer hard-coded).
    reads = np.zeros((len(records), 2), dtype='S{:d}'.format(field_len))
    for i, (read_id, sequence) in enumerate(records):
        reads[i, 0] = read_id
        reads[i, 1] = sequence
    return reads

In [3]:
def read_genome():
    """
    Loads the genome from genome.fa.

    Every FASTA record in the file is visited; the sequence of the last one is returned.

    :return: the genome sequence as a string, or None if the file holds no records
    """
    last_record = None
    for last_record in SeqIO.parse("genome.fa", "fasta"):
        # Nothing to do per record: only the final one is kept.
        pass
    if last_record is None:
        return None
    return str(last_record.seq)

In [4]:
def read_known_genes():
    """
    Parses genes.tab (tab-separated) and builds the Gene / Isoform / Exon object graph.

    The record type of a row is the text after the last '_' in its first column:
      * 'gene'    rows: column 1 is the gene id, column 2 a ';'-separated list of isoform ids
      * 'isoform' rows: column 1 is the isoform id, column 2 a ';'-separated list of exon ids
      * 'exon'    rows: column 1 is the exon id, columns 2 and 3 are integers passed to Exon
    Rows of any other type are ignored.

    :return: a dict mapping each gene id to its constructed Gene object
    """
    gene_rows, isoform_rows, exon_by_id = {}, {}, {}
    with open("genes.tab") as handle:
        for row in csv.reader(handle, dialect="excel-tab"):
            record_type = row[0].split('_')[-1]
            if record_type == 'exon':
                exon_by_id[row[1]] = Exon(row[1], int(row[2]), int(row[3]))
            elif record_type == 'isoform':
                isoform_rows[row[1]] = row[2].split(';')
            elif record_type == 'gene':
                gene_rows[row[1]] = row[2].split(';')

    # Resolve exon ids into Exon objects for every isoform
    isoform_by_id = {
        isoform_id: Isoform(isoform_id, [exon_by_id[exon_id] for exon_id in exon_ids])
        for isoform_id, exon_ids in isoform_rows.items()
    }

    # Resolve isoform ids into Isoform objects for every gene
    return {
        gene_id: Gene(gene_id, [isoform_by_id[isoform_id] for isoform_id in isoform_ids])
        for gene_id, isoform_ids in gene_rows.items()
    }

# Alignment Algorithm Class

In [5]:
# Intron size bounds. Not referenced yet in this section; presumably the minimum/maximum gap (in bases)
# allowed between two consecutive pieces of a spliced alignment -- confirm against the project spec.
MIN_INTRON_SIZE = 20
MAX_INTRON_SIZE = 10000


class Aligner:
    def __init__(self, genome_sequence, known_genes):
        """
        Initializes the aligner. Do all time intensive set up here. i.e. build suffix array.

        genome_sequence: a string (NOT TERMINATED BY '$') representing the bases of the genome
        known_genes: a python set of Gene objects (see shared.py) that represent known genes. You can get the isoforms
                     and exons from a Gene object
                     NOTE(review): read_known_genes() in this file returns a dict keyed by gene id, not a set --
                     confirm which form callers pass in.

        Time limit: 500 seconds maximum on the provided data. Note that our server is probably faster than your machine,
                    so don't stress if you are close. Server is 1.25 times faster than the i7 CPU on my computer
        """
        # NOTE(review): the suffix array is built over the raw genome here, without a '$' terminator --
        # confirm that this is what get_suffix_array / get_bwt expect.
        self.sa = get_suffix_array(genome_sequence)
        L = get_bwt(genome_sequence, self.sa)
        self.occ, self.M = get_occ(L), get_M(get_F(L))
        self.known_genes = known_genes

    def align(self, read_sequence):
        """
        Returns an alignment to the genome sequence. An alignment is a list of pieces.
        Each piece consists of a start index in the read, a start index in the genome, and a length
        indicating how many bases are aligned in this piece. Note that mismatches are count as "aligned".

        Note that <read_start_2> >= <read_start_1> + <length_1>. If your algorithm produces an alignment that
        violates this, we will remove pieces from your alignment arbitrarily until consecutive pieces
        satisfy <read_start_2> >= <read_start_1> + <length_1>

        Return value must be in the form (also see the project pdf):
        [(<read_start_1>, <reference_start_1, length_1), (<read_start_2>, <reference_start_2, length_2), ...]

        If no good matches are found: return the best match you can find or return []

        Time limit: 0.5 seconds per read on average on the provided data.
        """
        # TODO: implement the alignment search (e.g. against the isoforms of self.known_genes).
        # Until then report "no match" as the documented empty list instead of implicitly returning
        # None, so callers that iterate over the alignment do not crash.
        return []

# Testing

In [15]:
# Main
# Load the three project inputs once; the cells below reuse these module-level names.
reads = read_reads()  # numpy array with one (id, sequence) row per read from reads.fa
genome_sequence = read_genome()  # genome bases from genome.fa as a single string
known_genes = read_known_genes()  # dict: gene id -> Gene object built from genes.tab

In [16]:
# NOTE(review): the "+ 1" presumably accounts for the '$' terminator appended below -- confirm.
print("length of the genome sequence: " + str(len(genome_sequence) + 1))

# Initialization, inlined from Aligner.__init__ so the intermediate structures stay available as
# notebook globals. The timer wraps this work directly: previously the timing code was disabled
# inside a string literal, so `t` was undefined (or stale from an earlier run) when printed.
t = -time()
# NOTE(review): the suffix array is built over genome_sequence + '$' but get_bwt receives the
# unterminated genome -- confirm this matches what the helpers expect.
sa = get_suffix_array(genome_sequence + '$')
L = get_bwt(genome_sequence, sa)
occ, M = get_occ(L), get_M(get_F(L))
t += time()

print("time to run Aligner.__init__: " + str(t))

length of the genome sequence: 10949000
time to run Aligner.__init__: 356.30846524238586


In [29]:
# Build the transcriptome: for every known gene, one spliced sequence per isoform, obtained by
# concatenating the genome slices [exon.start:exon.end] of that isoform's exons in order.
t = -time()
transcriptome = {
    gene_id: [
        ''.join(genome_sequence[exon.start:exon.end] for exon in isoform.exons)
        for isoform in gene.isoforms
    ]
    for gene_id, gene in known_genes.items()
}
print("time to find the transcriptome: " + str(time() + t))

time to find the transcriptome: 0.0005118846893310547


In [46]:
# List every gene that has more than one isoform sequence, together with its isoform count.
for key in transcriptome:
    if len(transcriptome[key]) <= 1:
        continue
    print(key, len(transcriptome[key]))

ENSG00000233997 2
ENSG00000155313 10
ENSG00000154654 5
ENSG00000185272 6
ENSG00000228798 2
ENSG00000232560 2
ENSG00000224905 3
ENSG00000243064 6
ENSG00000215386 20
ENSG00000244676 2
ENSG00000224309 4
ENSG00000154645 7
ENSG00000280594 2
ENSG00000154639 5
ENSG00000166351 2


In [47]:
# Sanity check: number of isoforms stored on the Gene object for ENSG00000155313.
len(known_genes['ENSG00000155313'].isoforms)

10