# Reading the Data

In [1]:
from Bio import SeqIO
import numpy as np
import csv
from shared import *
from project_nader import *
from time import time

In [2]:
def read_reads():
    """
    Reads all the RNA reads from reads.fa and returns a list of sequences as strings.

    :return: a list of RNA sequence reads as strings
    """
    max_id_len = 600
    reads = np.zeros((1575, 2), dtype='U{:d}'.format(max_id_len))
    i = 0
    for seq_record in SeqIO.parse("reads.fa", "fasta"):
        reads[i, 0] = seq_record.id
        reads[i, 1] = str(seq_record.seq)
        i += 1
    return reads

In [3]:
def read_genome():
    """
    Reads the genome sequence from genome.fa and return the genome sequence as a string.

    :return: a string of the genome sequence
    """
    genome = None
    for seq_record in SeqIO.parse("genome.fa", "fasta"):
        genome = str(seq_record.seq)
    return genome

In [4]:
def read_known_genes():
    """
    Reads the un/known genes, isoforms, and exons from genes.tab and constructs objects for each
    and return the list of constructed genes.

    :return: known_genes (a list of known Gene objects), unknown_genes (a list of unknown Gene objects)
    """
    known_genes, known_isoforms, known_exons = {}, {}, {}
    unknown_genes, unknown_isoforms, unknown_exons = {}, {}, {}
    with open("genes.tab") as tsv:
        for line in csv.reader(tsv, dialect="excel-tab"):
            name = line[0]
            if name == 'gene':
                known_genes[line[1]] = line[2].split(';')
            elif name == 'isoform':
                known_isoforms[line[1]] = line[2].split(';')
            elif name == 'exon':
                known_exons[line[1]] = Exon(line[1], int(line[2]), int(line[3]))
            elif name == 'unknown_gene':
                unknown_genes[line[1]] = line[2].split(';')
            elif name == 'unknown_isoform':
                unknown_isoforms[line[1]] = line[2].split(';')
            elif name == 'unknown_exon':
                unknown_exons[line[1]] = Exon(line[1], int(line[2]), int(line[3]))
    # Create the known Isoform objects
    for k in known_isoforms:
        known_isoforms[k] = Isoform(k, [known_exons[key] for key in known_isoforms[k]])
        
    # Create the UNknown Isoform objects
    for k in unknown_isoforms:
        unknown_isoforms[k] = Isoform(k, [unknown_exons[key] for key in unknown_isoforms[k]])

    # Create the known Genes objects
    for k in known_genes:
        known_genes[k] = Gene(k, [known_isoforms[key] for key in known_genes[k]])
        
    # Create the UNknown Genes objects
    for k in unknown_genes:
        unknown_genes[k] = Gene(k, [unknown_isoforms[key] for key in unknown_genes[k]])
    return known_genes, unknown_genes

# Alignment Algorithm Class

In [5]:
MIN_INTRON_SIZE = 20
MAX_INTRON_SIZE = 10000


class Aligner:
    def __init__(self, genome_sequence, known_genes):
        """
        Initializes the aligner. Do all time intensive set up here. i.e. build suffix array.

        genome_sequence: a string (NOT TERMINATED BY '$') representing the bases of the of the genome
        known_genes: a python set of Gene objects (see shared.py) that represent known genes. You can get the isoforms
                     and exons from a Gene object

        Time limit: 500 seconds maximum on the provided data. Note that our server is probably faster than your machine,
                    so don't stress if you are close. Server is 1.25 times faster than the i7 CPU on my computer
        """
        self.sa = get_suffix_array(genome_sequence)
        L = get_bwt(genome_sequence, self.sa)
        self.occ, self.M = get_occ(L), get_M(get_F(L))
        self.known_genes = known_genes

    def align(self, read_sequence):
        """
        Returns an alignment to the genome sequence. An alignment is a list of pieces.
        Each piece consists of a start index in the read, a start index in the genome, and a length
        indicating how many bases are aligned in this piece. Note that mismatches are count as "aligned".

        Note that <read_start_2> >= <read_start_1> + <length_1>. If your algorithm produces an alignment that
        violates this, we will remove pieces from your alignment arbitrarily until consecutive pieces
        satisfy <read_start_2> >= <read_start_1> + <length_1>

        Return value must be in the form (also see the project pdf):
        [(<read_start_1>, <reference_start_1, length_1), (<read_start_2>, <reference_start_2, length_2), ...]

        If no good matches are found: return the best match you can find or return []

        Time limit: 0.5 seconds per read on average on the provided data.
        """
        # for gene in self.known_genes:

        pass

# Testing

In [6]:
# Main
reads = read_reads()
genome_sequence = read_genome()
known_genes, unknown_genes = read_known_genes()

In [7]:
print("length of the genome sequence: " + str(len(genome_sequence) + 1))

t = -time()
"""
aligner = Aligner(genome_sequence, known_genes)
t += time()
"""
# Initialization
sa = get_suffix_array(genome_sequence + '$')
L = get_bwt(genome_sequence, sa)
occ, M = get_occ(L), get_M(get_F(L))
# self.known_genes = known_genes

print("time to run Aligner.__init__: " + str(t + time()))

length of the genome sequence: 10949001
time to run Aligner.__init__: 326.63758087158203


In [8]:
t = -time()
known_transcriptome = {}
for gene in known_genes:
    known_transcriptome[gene] = {}
    for iso in known_genes[gene].isoforms:
        isoform, exons, total_len = '', [], 0
        for ex in iso.exons:
            isoform += genome_sequence[ex.start: ex.end]
            exons.append((total_len, ex.start, ex.end - ex.start))
            total_len += ex.end - ex.start
        known_transcriptome[gene][iso] = [isoform, exons]
print("time to find the known transcriptome: " + str(t + time()))

time to find the known transcriptome: 0.8037879467010498


In [9]:
t = -time()
unknown_transcriptome = {}
for gene in unknown_genes:
    unknown_transcriptome[gene] = {}
    for iso in unknown_genes[gene].isoforms:
        isoform, exons, total_len = '', [], 0
        for ex in iso.exons:
            isoform += genome_sequence[ex.start: ex.end]
            exons.append((total_len, ex.start, ex.end - ex.start))
            total_len += ex.end - ex.start
        unknown_transcriptome[gene][iso] = [isoform, exons]
print("time to find the UNknown transcriptome: " + str(t + time()))

time to find the UNknown transcriptome: 0.0003437995910644531


In [10]:
num_isoforms = 0
for gene in known_transcriptome:
    num_isoforms += len(known_transcriptome[gene])
print('number of known isoforms:', num_isoforms)

number of known isoforms: 127


In [11]:
num_isoforms = 0
for gene in unknown_transcriptome:
    num_isoforms += len(unknown_transcriptome[gene])
print('number of UNknown isoforms:', num_isoforms)

number of UNknown isoforms: 9


In [62]:
def find_alignment(read_len, align_start, exons):
    def find_start_location(lo, hi):
        mid = (lo + hi) // 2
        if exons[mid][0] <= align_start < exons[mid][0] + exons[mid][2]:
            return mid
        elif exons[mid][0] + exons[mid][2] <= align_start:
            return find_start_location(mid+1, hi)
        else:
            return find_start_location(lo, mid-1)
    idx = find_start_location(0, len(exons))
    algn = [(0, exons[idx][1] + (align_start - exons[idx][0]),
             read_len if read_len + (align_start - exons[idx][0]) <= exons[idx][2]
             else exons[idx][2] - (align_start - exons[idx][0]))]
    read_len -= exons[idx][2] - (align_start - exons[idx][0])
    while read_len > 0:
        idx += 1
        algn.append((algn[-1][0] + algn[-1][2], exons[idx][1],
                     read_len if read_len <= exons[idx][2]
                     else exons[idx][2]))
        read_len -= exons[idx][2]
    return algn

In [65]:
def align_to_isoform(read, isoform, exons):
    match = (-1, float("inf"))
    for i in range(len(isoform) - len(read) + 1):
        j, mismatches = 0, 0
        while j < len(read):
            if isoform[i+j] != read[j]:
                mismatches += 1
            if mismatches > MAX_NUM_MISMATCHES:
                break
            j += 1
        if j == len(read) and mismatches < match[1]:
            match = (i, mismatches)
    return match if match[0] == -1 else (find_alignment(len(read), match[0], exons), match[1])

In [14]:
def align_to_transcriptome(read, transcriptome):
    match = (-1, float("inf"))
    for gene in transcriptome:
        for iso in transcriptome[gene]:
            alignment, mismatches = align_to_isoform(read, transcriptome[gene][iso][0], transcriptome[gene][iso][1])
            if alignment != -1 and mismatches <= match[1]:
                match = (alignment, mismatches)
    return match

In [15]:
reads[0][1]

'ATTACTCTTGGGAATGAAATCCTATCTATATAAGCTGTGGTTTGAAATCC'

In [16]:
rand_iso

NameError: name 'rand_iso' is not defined

In [19]:
t = -time()
# align_to_isoform(reads[166][1], reads[166][1])
align_to_transcriptome(reads[150][1], known_transcriptome)


      0.41244983673095703


In [20]:
unable_to_match = []
avg_t, num_matches = 0, 0
for i in range(len(reads)):
    t = -time()
    start, mismatches = align_to_transcriptome(reads[i][1], known_transcriptome)
    if start == -1:
        unable_to_match.append(i)
        print(str(i) + ' not matched!')
    else:
        tt = time() + t
        print("      " + str(tt))
        avg_t += tt
        num_matches += 1

      0.4247000217437744
      0.42728090286254883
2 not matched!
3 not matched!
      0.41390490531921387
      0.4067389965057373
      0.3950479030609131
      0.40930986404418945
8 not matched!
      0.40666794776916504
      0.4186549186706543
      0.4017040729522705
      0.4010610580444336
      0.4059622287750244
      0.4019961357116699
      0.4021797180175781
      0.4129977226257324
      0.5382959842681885
      0.4143638610839844
      0.4086019992828369
      0.40575218200683594
      0.4063720703125
      0.4402010440826416
      0.40795373916625977
      0.41429805755615234
      0.4101247787475586
      0.4171762466430664
      0.4086148738861084
      0.41223597526550293
29 not matched!
      0.4057309627532959
      0.4100680351257324
      0.4138338565826416
      0.40763115882873535
      0.40746593475341797
      0.40997791290283203
      0.41702723503112793
      0.47773003578186035
      0.4250328540802002
      0.47082090377807617
      0.49718689918518066
  

      0.4159998893737793
      0.4063081741333008
      0.39157891273498535
      0.40276217460632324
      0.40596604347229004
      0.39348411560058594
      0.394819974899292
      0.40634775161743164
      0.40615391731262207
      0.4074831008911133
      0.3929007053375244
      0.4189779758453369
      0.40229225158691406
      0.39583706855773926
      0.3950169086456299
      0.3942117691040039
      0.40226221084594727
      0.39606189727783203
      0.3986940383911133
      0.4076039791107178
      0.41323018074035645
      0.41347193717956543
      0.4356238842010498
      0.46440792083740234
      0.4051079750061035
358 not matched!
      0.4139690399169922
      0.5364196300506592
      0.6018819808959961
      0.40328502655029297
      0.4414498805999756
      0.42417120933532715
      0.45027613639831543
      0.4453151226043701
      0.528209924697876
      0.40586018562316895
      0.42369675636291504
      0.44725608825683594
      0.43433213233947754
      0.4832320

      0.39301419258117676
      0.4027581214904785
      0.40460872650146484
      0.39864563941955566
      0.4096109867095947
      0.4027440547943115
      0.3986990451812744
      0.40561890602111816
      0.43325376510620117
      0.42627787590026855
      0.41867804527282715
      0.4027540683746338
      0.41164088249206543
      0.4185647964477539
      0.4076077938079834
      0.4187009334564209
      0.4308762550354004
      0.41495180130004883
      0.4352271556854248
      0.41945314407348633
      0.4021329879760742
      0.41216611862182617
      0.41202211380004883
      0.44038867950439453
      0.4083540439605713
      0.40613675117492676
      0.41754817962646484
      0.43224287033081055
      0.43003201484680176
      0.4442272186279297
      0.4853696823120117
      0.49390578269958496
      0.5486891269683838
      0.4041111469268799
      0.42435598373413086
      0.4535398483276367
      0.4173438549041748
      0.46270298957824707
      0.4716672897338867
     

      0.7477619647979736
      0.5183649063110352
      0.5163249969482422
      0.6855900287628174
      0.8879270553588867
      0.8409948348999023
      0.6839501857757568
      0.5215067863464355
      0.5181832313537598
      0.5148370265960693
      0.4669830799102783
      0.4601869583129883
      0.48242902755737305
      0.40375804901123047
      0.40723395347595215
1014 not matched!
1015 not matched!
      0.4154961109161377
      0.4066770076751709
      0.4004700183868408
      0.4114530086517334
      0.41798973083496094
      0.42052388191223145
      0.41070079803466797
      0.436939001083374
      0.4156627655029297
      0.41590404510498047
      0.4174478054046631
      0.39517974853515625
      0.4040060043334961
      0.4097940921783447
      0.40329408645629883
      0.4048349857330322
      0.3913888931274414
      0.4377269744873047
      0.5247921943664551
1035 not matched!
      0.4151930809020996
      0.4427220821380615
      0.49922704696655273
      0.4501

      0.5370380878448486
      0.4467909336090088
      0.44773292541503906
      0.4417867660522461
      0.450009822845459
      0.42934107780456543
      0.42724180221557617
      0.47118115425109863
      0.5480630397796631
      0.5516695976257324
      0.47721290588378906
      1.0044550895690918
      0.7661089897155762
      0.7865140438079834
      0.5135340690612793
1347 not matched!
      0.6890358924865723
1349 not matched!
      0.7781200408935547
      0.6820578575134277
      0.6334209442138672
      0.44152307510375977
      0.6973330974578857
      0.4407937526702881
      0.4955251216888428
      0.9743680953979492
      0.45625782012939453
      0.7253222465515137
      0.5594658851623535
1361 not matched!
      0.4835829734802246
      0.48328113555908203
      0.5910348892211914
      0.5955209732055664
      0.5721127986907959
      0.7023961544036865
      0.487699031829834
1369 not matched!
      0.5662860870361328
      0.5227007865905762
      0.71299719810485

In [55]:
print('    Average time: ' + str(avg_t / num_matches))

    Average time: 0.45989171990445066


In [24]:
print(unable_to_match)

[2, 3, 8, 29, 61, 84, 91, 104, 135, 154, 163, 167, 173, 179, 200, 201, 213, 218, 253, 268, 271, 276, 281, 293, 300, 322, 323, 358, 390, 392, 393, 413, 415, 430, 438, 444, 451, 454, 467, 475, 484, 499, 501, 506, 510, 528, 568, 570, 571, 576, 609, 618, 625, 641, 648, 650, 657, 659, 707, 710, 719, 729, 745, 753, 760, 779, 786, 827, 836, 856, 898, 905, 907, 920, 927, 936, 937, 956, 959, 967, 979, 985, 1014, 1015, 1035, 1050, 1062, 1080, 1083, 1085, 1086, 1094, 1100, 1107, 1119, 1134, 1142, 1150, 1153, 1161, 1181, 1193, 1196, 1212, 1224, 1229, 1232, 1252, 1253, 1266, 1276, 1281, 1320, 1323, 1325, 1347, 1349, 1361, 1369, 1381, 1388, 1391, 1409, 1427, 1441, 1442, 1446, 1451, 1463, 1471, 1477, 1486, 1517, 1520, 1547, 1550, 1556, 1567, 1571]


In [None]:
still_unable_to_match = []
for i in range(len(unable_to_match)):
    start, mismatches = align_to_transcriptome(reads[i][1], unknown_transcriptome)
    if start != -1:
        print(str(i) + ' matched to an unknown gene!')
    else:
        still_unable_to_match.append(i)

In [22]:
t = -time()
print('         ' + str(exact_suffix_matches(reads[0][1][:-38], M, occ)))
print("time to find the read in the genome: " + str(t + time()))

         ((3152211, 3152212), 12)
time to find the read in the genome: 0.0007557868957519531


In [None]:
"""
Use exact_suffix_matches but need to find consecutive matches with possible mismatches and restricted intron length
"""
def align_to_genome(read):
    # TODO
    pass

# Evaluation

In [25]:
from evaluation import *

In [47]:
known_isoforms = []
for gene in known_genes:
    known_isoforms.extend(known_genes[gene].isoforms)

In [48]:
unknown_isoforms = []
for gene in unknown_genes:
    unknown_isoforms.extend(unknown_genes[gene].isoforms)

In [52]:
genome_isoform_offsets = index_isoform_locations(known_isoforms, unknown_isoforms)

In [53]:
read_sequence = reads[150][1]
alignment = align_to_transcriptome(reads[150][1], known_transcriptome)[0]
alignment

[(0, 6759550, 50)]

In [64]:
CASE_GENE

'gene'

In [67]:
unable_to_match = []
for i in range(len(reads)):
    read_sequence = reads[i][1]
    alignment, mismatches = align_to_transcriptome(read_sequence, known_transcriptome)
    if alignment == -1:
        unable_to_match.append(i)
        print('      ', str(i) + ' not matched!')
    else:
        case, _ = evaluate_alignment(genome_sequence, read_sequence, alignment, unknown_isoforms, genome_isoform_offsets)
        if case != CASE_GENE:
            print('      ', i)
            print('      ', case, 'alignment')
            print('      ', alignment, read_sequence)

       2 not matched!
       3 not matched!
       8 not matched!
       29 not matched!
       61 not matched!
       84 not matched!
       91 not matched!
       104 not matched!
       135 not matched!
       154 not matched!
       163 not matched!
       167 not matched!
       173 not matched!
       179 not matched!
       200 not matched!
       201 not matched!
       213 not matched!
       218 not matched!
       253 not matched!
       268 not matched!
       271 not matched!
       276 not matched!
       281 not matched!
       293 not matched!
       300 not matched!
       322 not matched!
       323 not matched!
       358 not matched!
       390 not matched!
       392 not matched!
       393 not matched!
       413 not matched!
       415 not matched!
       430 not matched!
       438 not matched!
       444 not matched!
       451 not matched!
       454 not matched!
       467 not matched!
       475 not matched!
       484 not matched!
       499 not matched!
  

In [68]:
print('size of unmatched: ', len(unable_to_match))
print('unmatched read indices:', unable_to_match)

size of unmatched:  139
unmatched read indices: [2, 3, 8, 29, 61, 84, 91, 104, 135, 154, 163, 167, 173, 179, 200, 201, 213, 218, 253, 268, 271, 276, 281, 293, 300, 322, 323, 358, 390, 392, 393, 413, 415, 430, 438, 444, 451, 454, 467, 475, 484, 499, 501, 506, 510, 528, 568, 570, 571, 576, 609, 618, 625, 641, 648, 650, 657, 659, 707, 710, 719, 729, 745, 753, 760, 779, 786, 827, 836, 856, 898, 905, 907, 920, 927, 936, 937, 956, 959, 967, 979, 985, 1014, 1015, 1035, 1050, 1062, 1080, 1083, 1085, 1086, 1094, 1100, 1107, 1119, 1134, 1142, 1150, 1153, 1161, 1181, 1193, 1196, 1212, 1224, 1229, 1232, 1252, 1253, 1266, 1276, 1281, 1320, 1323, 1325, 1347, 1349, 1361, 1369, 1381, 1388, 1391, 1409, 1427, 1441, 1442, 1446, 1451, 1463, 1471, 1477, 1486, 1517, 1520, 1547, 1550, 1556, 1567, 1571]
