From 6e7b941b88cf4c09ae54c0bd1f149270440d28a6 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 10 Oct 2018 19:43:41 +0100 Subject: [PATCH] Fixed #133. Now implementing the unit-test --- Mikado/preparation/annotation_parser.py | 10 +- Mikado/preparation/checking.py | 6 +- Mikado/tests/prepare_phase_correctdness.gff | 4 + Mikado/tests/prepare_phase_correctedness.fa | 113 ++++++++++++++++++++ 4 files changed, 126 insertions(+), 7 deletions(-) create mode 100644 Mikado/tests/prepare_phase_correctdness.gff create mode 100644 Mikado/tests/prepare_phase_correctedness.fa diff --git a/Mikado/preparation/annotation_parser.py b/Mikado/preparation/annotation_parser.py index ca358174f..718c4e3eb 100644 --- a/Mikado/preparation/annotation_parser.py +++ b/Mikado/preparation/annotation_parser.py @@ -233,12 +233,12 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True exons.append(current) current = segment elif segment[0] == current[1] + 1: - current = (current[0], segment[1]) + current = (current[0], segment[1], None) else: logger.warning("Overlapping segments found in %s. Discarding it", tid) continue exons.append(current) - exon_lines[tid]["features"]["exon"] = exons[:] + exon_lines[tid]["features"]["exon"] = exons[:] else: raise KeyError(exon_lines[tid]["features"]) @@ -337,7 +337,7 @@ def load_from_gff(shelf_name, # Here we have to add the match feature as an exon, in case it is the only one present if row.feature == "match": exon_lines[row.id]["features"][row.feature] = [] - exon_lines[row.id]["features"][row.feature].append((row.start, row.end)) + exon_lines[row.id]["features"][row.feature].append((row.start, row.end, row.phase)) exon_lines[row.id]["strand_specific"] = strand_specific continue @@ -392,7 +392,7 @@ def load_from_gff(shelf_name, if row.feature not in exon_lines[tid]["features"]: exon_lines[tid]["features"][row.feature] = [] - exon_lines[tid]["features"][row.feature].append((row.start, row.end)) + exon_lines[tid]["features"][row.feature].append((row.start, row.end, row.phase)) new_ids.add(tid) else: continue @@ -499,7 +499,7 @@ def load_from_gtf(shelf_name, exon_lines[row.transcript]["attributes"].update(row.attributes) if row.feature not in exon_lines[row.transcript]["features"]: exon_lines[row.transcript]["features"][row.feature] = [] - exon_lines[row.transcript]["features"][row.feature].append((row.start, row.end)) + exon_lines[row.transcript]["features"][row.feature].append((row.start, row.end, row.phase)) new_ids.add(row.transcript) gff_handle.close() load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds) diff --git a/Mikado/preparation/checking.py b/Mikado/preparation/checking.py index c4a4a36d5..005a3f718 100644 --- a/Mikado/preparation/checking.py +++ b/Mikado/preparation/checking.py @@ -70,8 +70,10 @@ def create_transcript(lines, transcript_line.parent = lines["parent"] for feature in lines["features"]: - transcript_line.add_exons(lines["features"][feature], - features=feature) + coords = [(_[0], _[1]) for _ in lines["features"][feature]] + phases = [_[2] for _ in lines["features"][feature]] + transcript_line.add_exons(coords, features=feature, phases=phases) + transcript_object = TranscriptChecker(transcript_line, fasta_seq, lenient=lenient, diff --git a/Mikado/tests/prepare_phase_correctdness.gff b/Mikado/tests/prepare_phase_correctdness.gff new file mode 100644 index 000000000..c6bac8645 --- /dev/null +++ b/Mikado/tests/prepare_phase_correctdness.gff @@ -0,0 +1,4 @@ +test test gene 1 6706 1 + . ID=test_0245940;Name=test_0245940; +test test mRNA 1 6706 . + . ID=test_0245940.1;Parent=test_0245940;Name=test_0245940.1 +test test CDS 1 370 . + 2 ID=test_0245940..cds1;Parent=test_0245940.1 +test test exon 1 370 . + . ID=test_0245940.exon1;Parent=test_0245940.1 \ No newline at end of file diff --git a/Mikado/tests/prepare_phase_correctedness.fa b/Mikado/tests/prepare_phase_correctedness.fa new file mode 100644 index 000000000..f2061da09 --- /dev/null +++ b/Mikado/tests/prepare_phase_correctedness.fa @@ -0,0 +1,113 @@ +>test +ATGATGCAGCAGATGCAGTGGGTGATGCAGTGGGTGCAGTGAATGGTGCAGTGAGTGGAG +AAATGGATGATGCAGTGGATGCTGTGGATGCTGTGGATGCAGTGGATGCAGAGAGTGGTG +CAGCGAGTGGAGAAATGGATGATGCAGTGGATGCAGTGAATGCTGTGGGTGATGTGGTGG +ATGACTCAGTGGGTGCTGTGGACACAGTGGTGGATGTAGAAGATGCTGTCTACACAGTGT +ATGCAGTGATTTATGCAGTGGATGAAGAAGCAGAAGTAGCCCAGAGCTCAGCTTTCCATC +AATGGCACTTACAGCGGCCACCACATGCCACCATGACCCTCATGCCCTTTATTCCAGAAG +CTTCATGGAGGTAGGTTTGGGGACAGTGTGGGCATGGCTAGAGAGATGAAAGAATAGACT +AAAGTCCTGAAGTCACCTCATTACACACTGTGCACCCTCCAGGGGTCTGTCACCAAGAAG +TGCTGGGCCTCTGGAGACAGACAAGATGAGACCCTCAAAGCAGAGTCTGAGCTTGGGTGA +GGAAAGACTAGGCCAGATGTGCACGCTAAAAATATCCACATTCCGTAGACTCCAGTAAGA +AGGGCTAGCTACTCTGGCTGTGGCACCGGAAGCTTGCCTGGAGCAGCCTTGACCTTCACA +AGCCCCGCTCCCACCCCAGCCTCCCATGATCACCCCAGTCAGTGATGGGCAGCCTGGGTA +CCAGGATGTCAAAGAAACTTCCCTTGGGTGGTGAGATGGCGCAGCAGGCAAAGGTGCTTG +CCAACAAGCCTGCAGACTTGAGTTCGATTCCCAAAGCCCCCATAGTAGAAAGAGAGAATC +CATGCCCACAAATTGTCTCATGACATTCACACATTCTTAGTGTCACACATGCGCACACAC +ACATACACGCAATAAGTAAATAAATACTATTATAAAAATAAGCTTCTCTCAGGCCCCCAA +ATATATCCCTCCTCACCAGCCAAATGAATTCCAGATGGATTAAATTCAAACATTAAAGTC +ATAAAAAGCCCAAGAAAGGAGTAGAATATCATTAAACACGTATAGAAGCCTGGAGAGAAA +AAAGACAAATATAAAACCTGCAGCCATAAAAAAAAAAAAAAAAGGACAAGCAGATCCAAC +TGTACAAAAATTAAAGTCTCTATGTAGTAAAAAAAAAAAAAATTAAAAACATAAACTATA +TAAACTGGAAAGAATATCTGGAGCCAGCCTTCTTGGTGTTCAGAAAGTTCATTTATCTAT +CCTCATGAAAAAGACAGGGTTCAAACAAAGAAACTGCATAACACACACACGTACACACAT +ACCCACAGAGACACACACATATATATGCAAACAAAACAAAACAAAACACAGAAACAGTCC +TAAAACAAACATCCCCGGCCAATGAACACACACACACACACACACACCTGTGGCTGATGT +GAAGTGAGTGAGTCAAGTGAGCAAAGATGAATCAGGCTGGTGATGCCCAGAGACAGGGAG +CTGGGGTTCATGCATGCCTAGGGAAGGCCGGATCTGCATAGCTGCTCGGAGTGAGGCCGG +GGGATACCTGTTGGAACCCAATGTACAATACCCCTCATCCAGTGGGTCCAATTCGCAGCA +TCTGTTCTGTGCACACATCTGCCACATGCTCTGAGACATACACTCAAGGGCAGGTCAACA +TGCTCTAGTATTTGTGTGAAGACGATGGGGCAGGTGGTGACGGAGCTATGATTAAGTACC +CAAGCTTGTGAAAGAATTAGGAAGTGCCTTGGAGAGACCTTTAAGGTGTGCAGATCTGCA +GAACAGGATGATCTGAAGGTTTGGTATCATCTCAGCTTGGGAGATAGAGGCAGGAGGATG +ACCAGAAAACTCCTGGCCAGCTTAGACTATTGAATAAAGACCCAGTCTCAAAACATTAGT +CAGGAATGGTGATACGTGTCTATAATCTTAACACGTGAGAGGGGACAATAGGAGGACCAG +CTGTTCAAGAAAGATCAGTGACTACAAAGAGAGTTCAAGGGCAGCCTGGGTCACATAAGA +CCCTGCCTCAAAAAGTGCCTGAGGGCTGTAGAGATGGCTCAGTGATTAAGAGCACTGGCT +GCTCCTTCAGAGAATCTGGGTTCAATTCCCAGCACCCACATGACAGCTTAGAAATGTCTA +TAACTCCCGATCCAAGGGATCTGACACCTTCACACATATGCAGTCAACACACCAATGCAC +ATACAAGTAAAACTATTTTTTTTAAGTAAAGAAGGAAAAGAATTAATATCCTCCAGTTGC +TCCAGATATTATTTTTAAAAACAAACAAAACAGCCAGGGACTGTGGAGACCAAGGTGGAG +GACTGGTATGAGCTTGGGGCCAGCCTGTAATACATAGTCTGAGGCTAGCCTGGACCACAT +AGTGTGACCTGTCTCAGAATCCAGGGACTGTGGATGTAGGTCAGTGTAGAACACTTGCCT +GGTACCCTGAGAGCCTGGGTTTCATTGCCAGCACTGCAGAGATCTGAGCATCCTCAGGAG +TTATCGCTTCTTCTGCCGTGGGATGTGCACGCCCAGTCCTCCCTGGAGTCTGGCTCTATC +CACTGGTGCCTGGCAGACAGCACAGGAAGCTGATGGGTGGTCCTGGCTCCAGACTTCAGT +CCCACACAGTGTATTTCTCCCATCCATCTCCCATTGGTAACAGAATCGATATCCATTTAA +CCCCTCCCCGCTTGCTGGCCTCCCACTGCTTACAGGAAACTGTCCCTACTCTGTAACACA +CAGCATTAAAGGTCCTTGGCAGCCTGCTCCTCCCTTACCCACCAACTTCATCTGCCTACT +TTCAAACCAATCAGGTGCTTGGGAGAGCTAAAACCAGCAGGCCTGGGGACAGCGGTCCTC +CACCTGTTAGCCCGTGACCGTGGCCAACTCAGTCTCTCCCGGCTACATCTGAGATCTTCA +TCTGTAAAAGTGAGCTAATGCCATCTGGATGCTGGAGGAGGGGGACAGTGAGGCAGTGCA +TGTGTAGAAACCGCATGTCACACATCGTAGGGCATGTAGCGCAGAGGACACTCTTGGCGT +CCCTTTGTTCCGGTCTTTGTGAAGGCCCTTCCTCCTGTCTCCTCATCTTCTGGCCAGGAA +ACTCCTTGTAAACTCGTCCCTGACCACCCACACTGTGGTCCCCGGAATGTAAATTCATTT +GCTTGTGATGACAGCCAGCACTGGCGATGAGTATGGTCCGTGTCGACCGGCTTCTGGTGG +TCCCCAGCGGATGAGTCACGGCTTCCGTACCACTTGGCATCAACTCGGACAGAGTGTAGG +CGTGAGCTCAGCGGATGCTAAGTGCTTGTTTTGCACATCTGTGGGTGTGCCATTTGTCGG +TTTTGTTTGGGGTGGTTTGGTTTCTTGAGACAAAAGTATCTCACTAGGTAGCTCAGGATA +GTCTCGAACCTGTGGTCCCCCTGACTCTGTTTCCAAGTGCTGGATTTACAGATAGGTGCC +ACCACATCCAACCTACAGTACTTTTGGTTGCTGCCCAGCCTGGCCTTAAACTCTCTATGT +GGCTTTGAACTCTTGGTCTTGCTGCCTTGTCCTCTCGCACGCTGAGATCACAGGCATGCA +TGCTCAACTAGTATTCATTTTTTTTTGAGTGAATATATTTACCCACCTAACCTCAGTAAG +CCCCTGAGCACCCACTTTATTATCTATCTGCCTTATAAGAAGACTGCATGGGAAAAATGA +AAGAGGATCACAGAGTGTGCATGCACACAGGGCTGCCTCTGTGCTCACCCGCTGTGTTTT +TATCCAAGTTGCACAGATCTTCGCTCACGATTTTCCGCAGTGTTCTGCCATTTCTTTTAC +ACCCAAGCCCTCAGCTCATTGCACACCTGTGCATGTAAACAGTTAGATGCAATGTAACCC +TTGGGTGCAATAACCACACCTTCTCTTTCAAATGGAATGTTTATGAACAGAGAGGCAAGG +AGAGCAGCGTTCTGCCTCCAATTACAGATGCATAGGCCTCCAAAAACAGCATCCTTACGC +TCTTCCGGTCCACAAGCCGGAAAATGTCCGAACAGATGCTTCCAAACTGTGAGGGAAAAA +AAAATAAACAAGCACACAAATGGCTGCCTGGCTGGTCCCTGGAGGCCGGAGTCAGGCTGG +AATACAAGTTATAGGCGGTGAGAACCTTGGAATGGAAATCCTACTTGAAGACTCAAGGGA +TGAGGAGGAAACTGCATCCATTTGGCGGCTCCTTTGCACCTAATGCTCTGCTGAGGTCTT +TAGCAGCCAGTTTCATTTCACTTTGTTTTTAATTATTTCAAATGTATACTTATTTGTGGG +GGTGATGGGCAGGGTTCATGTCAGGGCACTTATGTGGAGATTGGAAGGAAATGTGGGAAT +CCTATTCTCTGCTTCCACCTCGAGGGCCCCCAGGGACTGAAATCAAGTCATCAGGATTGG +CAGTAAGCGCCTTTAGCTGATGAGGCCATCAGGCCTGGCTTTAATATCTATTCTGTGAGA +TGGGACGAGAAGGAAAGTAAGGTGGGTGGGGTGCACAGATATGGCCAGAATTCGGTGCCC +CTCTAGACTCCGTATTCAGCTTTTACACTATGCCGTGGAATGCTGCTGGGAGGCTGACAT +GGCTTCTTCTTTGCAAGAGACAGCCACACACCCTTGGATGCCCGTGAGCTTCTGGATGGT +TGAAAACCACTCACTCCTCTCCTCTCCCCAGGCAGGAAGATGTCTGTTCCTAGAGAAGTC +CAGAACTCACACAGCGGGAGAAAGGTTATACTGCCCCTTGTGTTGGGAGGACCAGCACAC +ACCTGCAACTCTACAGGAGTGCTGGGACCCCCATCTCTCTTCGTGCCTAGGCTGGACCCA +GGCTCTGACTTCTCACCAACGGAGACACCGGTGTCATTCACTCAGCTCCCATCCCGAGTT +CTAGAATGGTGCATGGCCCAGCGATCACAGTGCATAAGATGGGCCTGTGTGTAGACTTGC +CTCCCAAGATGCCGGGCAAGCCACCCGCACTCCACACCCGCGTTTTGTTCTTTTTAAAAG +ACGTGTCTATTGATTGCAGTGAACCTCTATGATCCTCAACACATCTGTACCGAGAGTGTT +CCCAGGCAAGCCAACGCTGTCCTCACCTTGGTTTGATAGACGTTAAGACCCTCCCACATC +AGCAGAACGCAGGAACGCCACAGTCTGAGTACAGGGGAGTGGTGGGGCTAGACAGTGATG +GAGAAAAATCAGAGTCTCCATGTAACTGATGGAAGACGATGTGTTGTCGAATCCGATGAA +AGATTTTTCTAGTCTTTGTTTTGCCAGAGAGGCCTTATATACATTTCCTTTAACAGGCTT +CCCTGACAGGAAGTCATTTCCAAAGACTCTGCAGCCTGAAGCTTCTTGGGGATTTCTTTT +CCTCTGGCCGACCTCAGTTTCTCCTATCTGTAAAATGGAGCCATTCGTAAAGAGTCAAAG +GTACTCTAAGGCTCCTGAGGCTTCCAGGACCCTAGATTTGGAGCACTCCAGCCAACAGCT +CAGGTCACCTTTTAAAGTGCTACACTGTTAACAAGTCCTGCCTCTTCACGTTACAGACCG +CTTGCCTGTGACAGAAATTGGGGTGGTCACCTGTCAGGATCCTCTGCCCAAGTGGTCCTA +CACCCTCACTTATAGCTAGTTATCAATGAAGGGTGTGTGTGTGTGTGTGTGTGTGTGCAG +GCACACACTTGATCAAAGAACAGGCCTCCCTCAGGTTTTTTTTTTTCTTCAATATGCAAG +CTGTCACAGATCTCAAGGCAAGATATTATTTCTGATAAATAGAGGCAGACAATAGATGAT +TTCTATATTAATATCACTAAAGAAAAAGTGCCCCTCTCCTGCAACTCACGGGAGACAGAG +GAGCCAGTGTATAGAGTTAGGCTACAGACACCGAACAGAGACTCTGGCACTGTGACAGTT +GGAGCACCGCCCCCCACCCCCACCTTGCACTGCACTCCCCCCTCCCCTCCCCTCTCCTCC +TTCACCTCACCCAGTTCGGAGAAAGAGGGTCTGCATTTTCCTCCTGCTGATGGTGGTTGA +TTAGCAGTCAGGTGGAGCGATGCTCCGGGGACAGCACAACAGATTAATTAAATCACTCTG +GCTGGCTAAGTGGTGACACTGTGATTTATAGCGCCGCTCACTCAACCAGACAGCAGGCTT +TTCTTGCATGCTGAAGGAAGGAGGGCCCGAGCTGGACCCCGGGTACAGACCTCAGGGGGC +GCTATGAACTTTCAGGATGAGCATTTTTACCCTTAAGGCAGAGGGCCCCCTGCTGGGAGA +CTCACAAGGGAGAGGAAGCAGAGGGAGTGCCAGGCCGCTGTGCAGTGGTGTGTCTCTGCA +GAAGAGAAGCCTCAAAAAAGATTCGGCTGTGGCTCACCCTGAAGAAACAGCCCCCTGCCA +GTCTCAGCCTGGTGACTCCGGGCCTTTGTCCCTGTTTTCCAGGACGGCATTGTCCCCTCA +GCGAGATGCCCTACTGTTCCTCTTAGATGTTCCGTGGGCGGCTGCCTCCACTTCCCTGAG +GAACTAGGTCACACAGGCTAGTGTGACAACGGGGGCTAGTCTAACACATTTAGCAAGTGC +CGGCTCTGTGATATCCTGTGTGGGAGCGGGGAGCTAGCTCATTTCTACACCATTCCAGAA +CACACACACACACACACACACACACACACACACACACACACACACA \ No newline at end of file