Skip to content

Commit

Permalink
Fixed EI-CoreBioinformatics#133. Now implementing the unit-test
Browse files: browse the repository at this point in the history
  • Loading branch information
lucventurini committed Oct 10, 2018
1 parent e0b50ea commit 6e7b941
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 7 deletions.
10 changes: 5 additions & 5 deletions Mikado/preparation/annotation_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,12 +233,12 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
exons.append(current)
current = segment
elif segment[0] == current[1] + 1:
current = (current[0], segment[1])
current = (current[0], segment[1], None)
else:
logger.warning("Overlapping segments found in %s. Discarding it", tid)
continue
exons.append(current)
exon_lines[tid]["features"]["exon"] = exons[:]
exon_lines[tid]["features"]["exon"] = exons[:]
else:
raise KeyError(exon_lines[tid]["features"])

Expand Down Expand Up @@ -337,7 +337,7 @@ def load_from_gff(shelf_name,
# Here we have to add the match feature as an exon, in case it is the only one present
if row.feature == "match":
exon_lines[row.id]["features"][row.feature] = []
exon_lines[row.id]["features"][row.feature].append((row.start, row.end))
exon_lines[row.id]["features"][row.feature].append((row.start, row.end, row.phase))

exon_lines[row.id]["strand_specific"] = strand_specific
continue
Expand Down Expand Up @@ -392,7 +392,7 @@ def load_from_gff(shelf_name,

if row.feature not in exon_lines[tid]["features"]:
exon_lines[tid]["features"][row.feature] = []
exon_lines[tid]["features"][row.feature].append((row.start, row.end))
exon_lines[tid]["features"][row.feature].append((row.start, row.end, row.phase))
new_ids.add(tid)
else:
continue
Expand Down Expand Up @@ -499,7 +499,7 @@ def load_from_gtf(shelf_name,
exon_lines[row.transcript]["attributes"].update(row.attributes)
if row.feature not in exon_lines[row.transcript]["features"]:
exon_lines[row.transcript]["features"][row.feature] = []
exon_lines[row.transcript]["features"][row.feature].append((row.start, row.end))
exon_lines[row.transcript]["features"][row.feature].append((row.start, row.end, row.phase))
new_ids.add(row.transcript)
gff_handle.close()
load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds)
Expand Down
6 changes: 4 additions & 2 deletions Mikado/preparation/checking.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ def create_transcript(lines,
transcript_line.parent = lines["parent"]

for feature in lines["features"]:
transcript_line.add_exons(lines["features"][feature],
features=feature)
coords = [(_[0], _[1]) for _ in lines["features"][feature]]
phases = [_[2] for _ in lines["features"][feature]]
transcript_line.add_exons(coords, features=feature, phases=phases)

transcript_object = TranscriptChecker(transcript_line,
fasta_seq,
lenient=lenient,
Expand Down
4 changes: 4 additions & 0 deletions Mikado/tests/prepare_phase_correctdness.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
test test gene 1 6706 1 + . ID=test_0245940;Name=test_0245940;
test test mRNA 1 6706 . + . ID=test_0245940.1;Parent=test_0245940;Name=test_0245940.1
test test CDS 1 370 . + 2 ID=test_0245940..cds1;Parent=test_0245940.1
test test exon 1 370 . + . ID=test_0245940.exon1;Parent=test_0245940.1
113 changes: 113 additions & 0 deletions Mikado/tests/prepare_phase_correctedness.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
>test
ATGATGCAGCAGATGCAGTGGGTGATGCAGTGGGTGCAGTGAATGGTGCAGTGAGTGGAG
AAATGGATGATGCAGTGGATGCTGTGGATGCTGTGGATGCAGTGGATGCAGAGAGTGGTG
CAGCGAGTGGAGAAATGGATGATGCAGTGGATGCAGTGAATGCTGTGGGTGATGTGGTGG
ATGACTCAGTGGGTGCTGTGGACACAGTGGTGGATGTAGAAGATGCTGTCTACACAGTGT
ATGCAGTGATTTATGCAGTGGATGAAGAAGCAGAAGTAGCCCAGAGCTCAGCTTTCCATC
AATGGCACTTACAGCGGCCACCACATGCCACCATGACCCTCATGCCCTTTATTCCAGAAG
CTTCATGGAGGTAGGTTTGGGGACAGTGTGGGCATGGCTAGAGAGATGAAAGAATAGACT
AAAGTCCTGAAGTCACCTCATTACACACTGTGCACCCTCCAGGGGTCTGTCACCAAGAAG
TGCTGGGCCTCTGGAGACAGACAAGATGAGACCCTCAAAGCAGAGTCTGAGCTTGGGTGA
GGAAAGACTAGGCCAGATGTGCACGCTAAAAATATCCACATTCCGTAGACTCCAGTAAGA
AGGGCTAGCTACTCTGGCTGTGGCACCGGAAGCTTGCCTGGAGCAGCCTTGACCTTCACA
AGCCCCGCTCCCACCCCAGCCTCCCATGATCACCCCAGTCAGTGATGGGCAGCCTGGGTA
CCAGGATGTCAAAGAAACTTCCCTTGGGTGGTGAGATGGCGCAGCAGGCAAAGGTGCTTG
CCAACAAGCCTGCAGACTTGAGTTCGATTCCCAAAGCCCCCATAGTAGAAAGAGAGAATC
CATGCCCACAAATTGTCTCATGACATTCACACATTCTTAGTGTCACACATGCGCACACAC
ACATACACGCAATAAGTAAATAAATACTATTATAAAAATAAGCTTCTCTCAGGCCCCCAA
ATATATCCCTCCTCACCAGCCAAATGAATTCCAGATGGATTAAATTCAAACATTAAAGTC
ATAAAAAGCCCAAGAAAGGAGTAGAATATCATTAAACACGTATAGAAGCCTGGAGAGAAA
AAAGACAAATATAAAACCTGCAGCCATAAAAAAAAAAAAAAAAGGACAAGCAGATCCAAC
TGTACAAAAATTAAAGTCTCTATGTAGTAAAAAAAAAAAAAATTAAAAACATAAACTATA
TAAACTGGAAAGAATATCTGGAGCCAGCCTTCTTGGTGTTCAGAAAGTTCATTTATCTAT
CCTCATGAAAAAGACAGGGTTCAAACAAAGAAACTGCATAACACACACACGTACACACAT
ACCCACAGAGACACACACATATATATGCAAACAAAACAAAACAAAACACAGAAACAGTCC
TAAAACAAACATCCCCGGCCAATGAACACACACACACACACACACACCTGTGGCTGATGT
GAAGTGAGTGAGTCAAGTGAGCAAAGATGAATCAGGCTGGTGATGCCCAGAGACAGGGAG
CTGGGGTTCATGCATGCCTAGGGAAGGCCGGATCTGCATAGCTGCTCGGAGTGAGGCCGG
GGGATACCTGTTGGAACCCAATGTACAATACCCCTCATCCAGTGGGTCCAATTCGCAGCA
TCTGTTCTGTGCACACATCTGCCACATGCTCTGAGACATACACTCAAGGGCAGGTCAACA
TGCTCTAGTATTTGTGTGAAGACGATGGGGCAGGTGGTGACGGAGCTATGATTAAGTACC
CAAGCTTGTGAAAGAATTAGGAAGTGCCTTGGAGAGACCTTTAAGGTGTGCAGATCTGCA
GAACAGGATGATCTGAAGGTTTGGTATCATCTCAGCTTGGGAGATAGAGGCAGGAGGATG
ACCAGAAAACTCCTGGCCAGCTTAGACTATTGAATAAAGACCCAGTCTCAAAACATTAGT
CAGGAATGGTGATACGTGTCTATAATCTTAACACGTGAGAGGGGACAATAGGAGGACCAG
CTGTTCAAGAAAGATCAGTGACTACAAAGAGAGTTCAAGGGCAGCCTGGGTCACATAAGA
CCCTGCCTCAAAAAGTGCCTGAGGGCTGTAGAGATGGCTCAGTGATTAAGAGCACTGGCT
GCTCCTTCAGAGAATCTGGGTTCAATTCCCAGCACCCACATGACAGCTTAGAAATGTCTA
TAACTCCCGATCCAAGGGATCTGACACCTTCACACATATGCAGTCAACACACCAATGCAC
ATACAAGTAAAACTATTTTTTTTAAGTAAAGAAGGAAAAGAATTAATATCCTCCAGTTGC
TCCAGATATTATTTTTAAAAACAAACAAAACAGCCAGGGACTGTGGAGACCAAGGTGGAG
GACTGGTATGAGCTTGGGGCCAGCCTGTAATACATAGTCTGAGGCTAGCCTGGACCACAT
AGTGTGACCTGTCTCAGAATCCAGGGACTGTGGATGTAGGTCAGTGTAGAACACTTGCCT
GGTACCCTGAGAGCCTGGGTTTCATTGCCAGCACTGCAGAGATCTGAGCATCCTCAGGAG
TTATCGCTTCTTCTGCCGTGGGATGTGCACGCCCAGTCCTCCCTGGAGTCTGGCTCTATC
CACTGGTGCCTGGCAGACAGCACAGGAAGCTGATGGGTGGTCCTGGCTCCAGACTTCAGT
CCCACACAGTGTATTTCTCCCATCCATCTCCCATTGGTAACAGAATCGATATCCATTTAA
CCCCTCCCCGCTTGCTGGCCTCCCACTGCTTACAGGAAACTGTCCCTACTCTGTAACACA
CAGCATTAAAGGTCCTTGGCAGCCTGCTCCTCCCTTACCCACCAACTTCATCTGCCTACT
TTCAAACCAATCAGGTGCTTGGGAGAGCTAAAACCAGCAGGCCTGGGGACAGCGGTCCTC
CACCTGTTAGCCCGTGACCGTGGCCAACTCAGTCTCTCCCGGCTACATCTGAGATCTTCA
TCTGTAAAAGTGAGCTAATGCCATCTGGATGCTGGAGGAGGGGGACAGTGAGGCAGTGCA
TGTGTAGAAACCGCATGTCACACATCGTAGGGCATGTAGCGCAGAGGACACTCTTGGCGT
CCCTTTGTTCCGGTCTTTGTGAAGGCCCTTCCTCCTGTCTCCTCATCTTCTGGCCAGGAA
ACTCCTTGTAAACTCGTCCCTGACCACCCACACTGTGGTCCCCGGAATGTAAATTCATTT
GCTTGTGATGACAGCCAGCACTGGCGATGAGTATGGTCCGTGTCGACCGGCTTCTGGTGG
TCCCCAGCGGATGAGTCACGGCTTCCGTACCACTTGGCATCAACTCGGACAGAGTGTAGG
CGTGAGCTCAGCGGATGCTAAGTGCTTGTTTTGCACATCTGTGGGTGTGCCATTTGTCGG
TTTTGTTTGGGGTGGTTTGGTTTCTTGAGACAAAAGTATCTCACTAGGTAGCTCAGGATA
GTCTCGAACCTGTGGTCCCCCTGACTCTGTTTCCAAGTGCTGGATTTACAGATAGGTGCC
ACCACATCCAACCTACAGTACTTTTGGTTGCTGCCCAGCCTGGCCTTAAACTCTCTATGT
GGCTTTGAACTCTTGGTCTTGCTGCCTTGTCCTCTCGCACGCTGAGATCACAGGCATGCA
TGCTCAACTAGTATTCATTTTTTTTTGAGTGAATATATTTACCCACCTAACCTCAGTAAG
CCCCTGAGCACCCACTTTATTATCTATCTGCCTTATAAGAAGACTGCATGGGAAAAATGA
AAGAGGATCACAGAGTGTGCATGCACACAGGGCTGCCTCTGTGCTCACCCGCTGTGTTTT
TATCCAAGTTGCACAGATCTTCGCTCACGATTTTCCGCAGTGTTCTGCCATTTCTTTTAC
ACCCAAGCCCTCAGCTCATTGCACACCTGTGCATGTAAACAGTTAGATGCAATGTAACCC
TTGGGTGCAATAACCACACCTTCTCTTTCAAATGGAATGTTTATGAACAGAGAGGCAAGG
AGAGCAGCGTTCTGCCTCCAATTACAGATGCATAGGCCTCCAAAAACAGCATCCTTACGC
TCTTCCGGTCCACAAGCCGGAAAATGTCCGAACAGATGCTTCCAAACTGTGAGGGAAAAA
AAAATAAACAAGCACACAAATGGCTGCCTGGCTGGTCCCTGGAGGCCGGAGTCAGGCTGG
AATACAAGTTATAGGCGGTGAGAACCTTGGAATGGAAATCCTACTTGAAGACTCAAGGGA
TGAGGAGGAAACTGCATCCATTTGGCGGCTCCTTTGCACCTAATGCTCTGCTGAGGTCTT
TAGCAGCCAGTTTCATTTCACTTTGTTTTTAATTATTTCAAATGTATACTTATTTGTGGG
GGTGATGGGCAGGGTTCATGTCAGGGCACTTATGTGGAGATTGGAAGGAAATGTGGGAAT
CCTATTCTCTGCTTCCACCTCGAGGGCCCCCAGGGACTGAAATCAAGTCATCAGGATTGG
CAGTAAGCGCCTTTAGCTGATGAGGCCATCAGGCCTGGCTTTAATATCTATTCTGTGAGA
TGGGACGAGAAGGAAAGTAAGGTGGGTGGGGTGCACAGATATGGCCAGAATTCGGTGCCC
CTCTAGACTCCGTATTCAGCTTTTACACTATGCCGTGGAATGCTGCTGGGAGGCTGACAT
GGCTTCTTCTTTGCAAGAGACAGCCACACACCCTTGGATGCCCGTGAGCTTCTGGATGGT
TGAAAACCACTCACTCCTCTCCTCTCCCCAGGCAGGAAGATGTCTGTTCCTAGAGAAGTC
CAGAACTCACACAGCGGGAGAAAGGTTATACTGCCCCTTGTGTTGGGAGGACCAGCACAC
ACCTGCAACTCTACAGGAGTGCTGGGACCCCCATCTCTCTTCGTGCCTAGGCTGGACCCA
GGCTCTGACTTCTCACCAACGGAGACACCGGTGTCATTCACTCAGCTCCCATCCCGAGTT
CTAGAATGGTGCATGGCCCAGCGATCACAGTGCATAAGATGGGCCTGTGTGTAGACTTGC
CTCCCAAGATGCCGGGCAAGCCACCCGCACTCCACACCCGCGTTTTGTTCTTTTTAAAAG
ACGTGTCTATTGATTGCAGTGAACCTCTATGATCCTCAACACATCTGTACCGAGAGTGTT
CCCAGGCAAGCCAACGCTGTCCTCACCTTGGTTTGATAGACGTTAAGACCCTCCCACATC
AGCAGAACGCAGGAACGCCACAGTCTGAGTACAGGGGAGTGGTGGGGCTAGACAGTGATG
GAGAAAAATCAGAGTCTCCATGTAACTGATGGAAGACGATGTGTTGTCGAATCCGATGAA
AGATTTTTCTAGTCTTTGTTTTGCCAGAGAGGCCTTATATACATTTCCTTTAACAGGCTT
CCCTGACAGGAAGTCATTTCCAAAGACTCTGCAGCCTGAAGCTTCTTGGGGATTTCTTTT
CCTCTGGCCGACCTCAGTTTCTCCTATCTGTAAAATGGAGCCATTCGTAAAGAGTCAAAG
GTACTCTAAGGCTCCTGAGGCTTCCAGGACCCTAGATTTGGAGCACTCCAGCCAACAGCT
CAGGTCACCTTTTAAAGTGCTACACTGTTAACAAGTCCTGCCTCTTCACGTTACAGACCG
CTTGCCTGTGACAGAAATTGGGGTGGTCACCTGTCAGGATCCTCTGCCCAAGTGGTCCTA
CACCCTCACTTATAGCTAGTTATCAATGAAGGGTGTGTGTGTGTGTGTGTGTGTGTGCAG
GCACACACTTGATCAAAGAACAGGCCTCCCTCAGGTTTTTTTTTTTCTTCAATATGCAAG
CTGTCACAGATCTCAAGGCAAGATATTATTTCTGATAAATAGAGGCAGACAATAGATGAT
TTCTATATTAATATCACTAAAGAAAAAGTGCCCCTCTCCTGCAACTCACGGGAGACAGAG
GAGCCAGTGTATAGAGTTAGGCTACAGACACCGAACAGAGACTCTGGCACTGTGACAGTT
GGAGCACCGCCCCCCACCCCCACCTTGCACTGCACTCCCCCCTCCCCTCCCCTCTCCTCC
TTCACCTCACCCAGTTCGGAGAAAGAGGGTCTGCATTTTCCTCCTGCTGATGGTGGTTGA
TTAGCAGTCAGGTGGAGCGATGCTCCGGGGACAGCACAACAGATTAATTAAATCACTCTG
GCTGGCTAAGTGGTGACACTGTGATTTATAGCGCCGCTCACTCAACCAGACAGCAGGCTT
TTCTTGCATGCTGAAGGAAGGAGGGCCCGAGCTGGACCCCGGGTACAGACCTCAGGGGGC
GCTATGAACTTTCAGGATGAGCATTTTTACCCTTAAGGCAGAGGGCCCCCTGCTGGGAGA
CTCACAAGGGAGAGGAAGCAGAGGGAGTGCCAGGCCGCTGTGCAGTGGTGTGTCTCTGCA
GAAGAGAAGCCTCAAAAAAGATTCGGCTGTGGCTCACCCTGAAGAAACAGCCCCCTGCCA
GTCTCAGCCTGGTGACTCCGGGCCTTTGTCCCTGTTTTCCAGGACGGCATTGTCCCCTCA
GCGAGATGCCCTACTGTTCCTCTTAGATGTTCCGTGGGCGGCTGCCTCCACTTCCCTGAG
GAACTAGGTCACACAGGCTAGTGTGACAACGGGGGCTAGTCTAACACATTTAGCAAGTGC
CGGCTCTGTGATATCCTGTGTGGGAGCGGGGAGCTAGCTCATTTCTACACCATTCCAGAA
CACACACACACACACACACACACACACACACACACACACACACACA

0 comments on commit 6e7b941

Please sign in to comment.