In [1]:
from project import *

In [2]:
from Bio import SeqIO
import numpy as np
import csv
from shared import *
from time import time
import numpy as np

In [3]:
def read_reads(path="reads.fa"):
    """
    Read every FASTA record from ``path`` into a two-column string array.

    Column 0 holds the record id and column 1 the read sequence, so
    ``reads[i][1]`` is the i-th read and ``len(reads)`` equals the number of
    records actually present in the file.

    :param path: FASTA file to parse; defaults to "reads.fa"
    :return: a numpy unicode array of shape (n_reads, 2) with rows (id, sequence)
    """
    records = [(record.id, str(record.seq)) for record in SeqIO.parse(path, "fasta")]
    # Building the array from the collected records (instead of filling a
    # pre-allocated 1575 x 2 buffer) keeps the row count in step with the file
    # and lets numpy pick an item width that fits the longest id/sequence, so
    # nothing is padded with empty rows or truncated.
    # reshape(-1, 2) preserves the two-column shape when the file has no records.
    return np.array(records, dtype=str).reshape(-1, 2)

In [4]:
def read_genome(path="genome.fa"):
    """
    Read the genome sequence from a FASTA file and return it as a string.

    Every record in the file is parsed. When the file holds more than one
    record, the sequence of the last record is returned; when it holds none,
    None is returned.

    :param path: FASTA file to parse; defaults to "genome.fa"
    :return: the genome sequence as a string, or None if the file has no records
    """
    last_record = None
    # Drain the parser, remembering only the final record; the sequence is
    # converted to a string once at the end rather than once per record.
    for last_record in SeqIO.parse(path, "fasta"):
        pass
    if last_record is None:
        return None
    return str(last_record.seq)

In [5]:
def read_known_genes(path="genes.tab"):
    """
    Parse the tab-separated annotation file and build the Gene objects it describes.

    Each row starts with a tag: ``gene`` / ``isoform`` / ``exon`` for the known
    annotation, or the same tags prefixed with ``unknown_`` for the unknown one.
    Gene and isoform rows carry an id in the second column and a ';'-separated
    list of member ids in the third. Exon rows carry an id followed by two
    integer columns, which are passed to ``Exon`` in file order. Blank rows and
    rows with any other tag are ignored.

    :param path: annotation file to read; defaults to "genes.tab"
    :return: (known_genes, unknown_genes) -- two dicts mapping gene id to Gene object
    :raises KeyError: if a gene or isoform row references an id that has no row of its own
    """
    known = {'gene': {}, 'isoform': {}, 'exon': {}}
    unknown = {'gene': {}, 'isoform': {}, 'exon': {}}

    # newline='' lets the csv module do its own line-ending handling.
    with open(path, newline='') as tsv:
        for row in csv.reader(tsv, dialect="excel-tab"):
            if not row:
                # csv.reader yields [] for a blank line; indexing it would raise.
                continue
            tag = row[0]
            if tag.startswith('unknown_'):
                tables, kind = unknown, tag[len('unknown_'):]
            else:
                tables, kind = known, tag
            if kind == 'exon':
                tables['exon'][row[1]] = Exon(row[1], int(row[2]), int(row[3]))
            elif kind in ('gene', 'isoform'):
                # Member ids are resolved to objects only after the whole file
                # is read, so rows may appear in any order.
                tables[kind][row[1]] = row[2].split(';')

    # Replace each isoform's exon-id list with an Isoform object (known first).
    for tables in (known, unknown):
        exons = tables['exon']
        isoforms = tables['isoform']
        for isoform_id in isoforms:
            isoforms[isoform_id] = Isoform(
                isoform_id, [exons[exon_id] for exon_id in isoforms[isoform_id]]
            )

    # Replace each gene's isoform-id list with a Gene object (known first).
    for tables in (known, unknown):
        isoforms = tables['isoform']
        genes = tables['gene']
        for gene_id in genes:
            genes[gene_id] = Gene(
                gene_id, [isoforms[isoform_id] for isoform_id in genes[gene_id]]
            )

    return known['gene'], unknown['gene']

In [9]:
# Main: load the reads, the genome sequence and the gene annotation.
reads = read_reads()
genome_sequence = read_genome()
known_genes, unknown_genes = read_known_genes()
# The Aligner below is given the known Gene objects as a plain list, in the
# same order as the dict's keys.
known_genes_list = list(known_genes.values())

In [10]:
# Construct the Aligner and display how many wall-clock seconds that took.
build_started = time()
aligner = Aligner(genome_sequence, known_genes_list)
time() - build_started

392.3921148777008

# Evaluation

In [11]:
from evaluation import *

In [12]:
# Flatten the isoforms of every known gene into one list, gene by gene.
known_isoforms = [
    isoform
    for known_gene in known_genes.values()
    for isoform in known_gene.isoforms
]

In [13]:
# Flatten the isoforms of every unknown gene into one list, gene by gene.
unknown_isoforms = [
    isoform
    for unknown_gene in unknown_genes.values()
    for isoform in unknown_gene.isoforms
]

In [14]:
# Index the known and unknown isoforms once; the result is passed to
# evaluate_alignment for every read in the evaluation loop.
# NOTE(review): index_isoform_locations is not defined in this notebook --
# presumably it comes from the `evaluation` star import; confirm.
genome_isoform_offsets = index_isoform_locations(known_isoforms, unknown_isoforms)

In [15]:
# Spot-check: align one read (row 150, sequence column) and display the result.
read_sequence = reads[150, 1]
alignment = aligner.align(read_sequence)
alignment

[(0, 6759550, 50)]

In [None]:
# Align every read, timing only the aligner.align call, and report each read
# whose evaluation case is anything other than CASE_GENE.
total_time = 0
# Indices of reads for which aligner.align returned the -1 sentinel. The list
# is created here because no earlier cell defines it; without this the -1
# branch raises NameError on a fresh kernel, and re-running the cell would
# otherwise keep appending to a stale list.
unable_to_match = []
for i, read_row in enumerate(reads):
    read_sequence = read_row[1]  # column 1 is the sequence, column 0 the read id
    started = time()
    alignment = aligner.align(read_sequence)
    total_time += time() - started
    # Progress line: printed for every read, before any report for that read.
    print('      ', i)
    if alignment == -1:
        unable_to_match.append(i)
        print('      ', str(i) + ' not matched!')
    else:
        case, _ = evaluate_alignment(genome_sequence, read_sequence, alignment, unknown_isoforms, genome_isoform_offsets)
        if case != CASE_GENE:
            print('      ', i)
            print('      ', case, 'alignment')
            print('      ', alignment, read_sequence)

       0
       1
       2
       2
       unaligned alignment
       [] GTGACTTAGCTGCAAAACCTACTGCGAGTGATGCCGTGGCTCCGATAGGT
       3
       3
       unaligned alignment
       [] TCTCGGGGTGAATACCTCTTATCGCGATACCTCCGGGGACTAGTGCGCCA
       4
       5
       6
       7
       8
       8
       unaligned alignment
       [] TAATCAGAAGGTGGATCAACTGGAAGATGTGCCTCCTCCAAAGAGCCGTA
       9
       10
       11
       12
       13
       14
       15
       16
       17
       18
       19
       20
       21
       22
       23
       24
       25
       26
       27
       28
       29
       29
       unaligned alignment
       [] AGCTTTGAAGAGACTACTGTCGATTTCTCCTGTCGCTGTCCGTGCGGACT
       30
       31
       32
       33
       34
       35
       36
       37
       38
       39
       40
       41
       42
       43
       44
       45
       46
       47
       48
       49
       50
       51
       52
       53
       54
       55
       56
       57
       58
       59
       60
       61
 

       431
       432
       433
       434
       435
       436
       437
       438
       438
       unaligned alignment
       [] TGAATCGCTTAATCCAACACTGAGGCCGGTCGTGGAGACGCGATGCAGTG
       439
       440
       441
       442
       443
       444
       444
       unaligned alignment
       [] CCGAAGAGAAGCCACAAGTAGCGAGACCCGGAACTGGATAAGCTGCGATG
       445
       446
       447
       448
       449
       450
       451
       451
       unaligned alignment
       [] ACTTGTGTCCTTTGTCCCCTTGGAAGTTAGCTCGCGACGCTAGAATTTCA
       452
       453
       454
       454
       unaligned alignment
       [] GGTAAGGTATATCCTTCGCCGGTCACAACTCGAACAACGAGGATGTACGG
       455
       456
       457
       458
       459
       460
       461
       462
       463
       464
       465
       466
       467
       467
       unaligned alignment
       [] AGACCAATGCCTGGATACCATTCGCTACTAACAGTCCATAAAAATGTCGC
       468
       469
       470
       471
       472
       473
       474
       475
       475

       845
       846
       847
       848
       849
       850
       851
       852
       853
       854
       855
       856
       856
       hidden_gene alignment
       [(0, 6455659, 50)] GAGTGCAGTGGCGCGATCCCGGCCCACTGCAAGCTCTGCCTCCCGGGTTC
       857
       858
       859
       860
       861
       862
       863
       864
       865
       866
       867
       868
       869
       870
       871
       872
       873
       874
       875
       876
       877
       878
       879
       880
       881
       882
       883
       884
       885
       886
       887
       888
       889
       890
       891
       892
       893
       894
       895
       896
       897
       898
       898
       unaligned alignment
       [] TTTATGAGTGATCTTTGTATGGTTAAGGACGACTCACAGTCTCCGAAGCC
       899
       900
       901
       902
       903
       904
       905
       905
       unaligned alignment
       [] CAACCTCATATGGATAGACCGGAGGGAAAGCTGTCCGAGTGGAACTGCGT
       906
   