diff --git a/.travis.yml b/.travis.yml index 24f113e..504c7c5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,39 +1,31 @@ -notifications: # set notification options - email: - recipients: - - mgopez95@gmail.com - - peter.kruczkiewicz@gmail.com - - # change is when the repo status goes from pass to fail or vice versa - on_success: change - on_failure: change - language: python python: - - '3.6' +- '3.6' os: - - linux -# - osx - -# Whitelist of branches to run CI Testing on. +- linux branches: only: - - master - - development - -# Blacklist of branches to not run CI testing on. -# branches: -# except: -# - dontrunme -# - test - -before_install: - - sudo apt-get -qq update; - -install: - - python setup.py install - #Install pytest for testing. - - pip install pytest - -script: - - pytest + - master + - development +install: pip install -U tox-travis +script: tox +deploy: + provider: pypi + distributions: sdist bdist_wheel + user: peterk87 + deployment: bio-hansel + password: + secure: feBsWxq8pSl/Eisu262EJH3E3JYgeWVmmwHCHut1474BxqXDvoUOGQmD7WoxwLCUVwvTE9gq37KZhYA4QDBwGI2vTwEegEeZGEo6ivPPO68TEkEFh00HwvXmFnYjllTtVISkUpBiUT8bYXdcPzR66bTfig+tZvlKIMATbHhzwFrWh1dkWkyiUojUaD4T861M0n774B0T8nyS4ppb7GT4CfkJXm9PpGMNWu04Cv0U2tTAIABT0OvvrstP1o1XFvOLNP3KI7aV/Onf24r3D5/jqtQN5ET90HFqvpoumRQi4XnJ10RWLySSOIlJ8HEE7giHvFz//TAGA4e3rBB4B0POun+qtAlriH2aq62vpBhm4jLZQOI8ARm6G1GPBKXZq+LnFJmQKFritYpNUubdpsaCK1ykoDgVOQh9FrAVXTcWvN6PPPwbrwZ2L5DkKsKyYK9QdtgM7Yhhr3cT2tM74jPQAeh9ky7iow4ltTlX/4NNegXwVxn7Gmn8Dkeg5dFeh/CcfaaHKSd3Se2DUe3vPldoQgkNJSUyFL65MvfftQP/pRRpLfPqkrHl6QjhCTdLLUvTTtyQtkvO5a3vNKtbScMK+epvGKrwSSPBIckYS0BUWHM/KoswnR+VPWh5wQmJWLHliukuseCx7qXJkABuZNsl8YtFXB+uCXGV9eoErR4sD5c= + on: + tags: true + repo: phac-nml/bio_hansel + python: 3.6 +notifications: + email: + recipients: + - mgopez95@gmail.com + - peter.kruczkiewicz@gmail.com + - darian.hole@canada.ca + - philip.mabon@canada.ca + on_success: change + on_failure: change diff --git a/bio_hansel/__init__.py b/bio_hansel/__init__.py 
index 19e40d4..ae19273 100644 --- a/bio_hansel/__init__.py +++ b/bio_hansel/__init__.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- -__version__ = '2.2.0' +__version__ = '2.3.0' program_name = 'bio_hansel' program_summary = 'Subtype microbial genomes using SNV targeting k-mer subtyping schemes.' program_desc = program_summary + ''' -Includes schemes for Salmonella enterica spp. enterica serovar Heidelberg, Enteritidis, and Typhi subtyping. Also includes a Mycobacterium Tuberculosis scheme. +Includes schemes for Salmonella enterica spp. enterica serovar Heidelberg, Enteritidis, Typhi, and Typhimurium subtyping. Also includes a Mycobacterium Tuberculosis scheme. Developed by Geneviève Labbé, James Robertson, Peter Kruczkiewicz, Marisa Rankin, Matthew Gopez, Chad R. Laing, Philip Mabon, Kim Ziebell, Aleisha R. Reimer, Lorelee Tschetter, Gary Van Domselaar, Sadjia Bekal, Kimberley A. MacDonald, Linda Hoang, Linda Chui, Danielle Daignault, Durda Slavic, Frank Pollari, E. Jane Parmley, David Son, Darian Hole, Philip Mabon, Elissa Giang, Lok Kan Lee, Jonathan Moffat, Marisa Rankin, Joanne MacKinnon, Roger Johnson, John H.E. Nash. 
''' diff --git a/bio_hansel/const.py b/bio_hansel/const.py index f7f5442..56be316 100644 --- a/bio_hansel/const.py +++ b/bio_hansel/const.py @@ -13,13 +13,13 @@ 'version': '1.0.7', 'subtyping_params': SubtypingParams(low_coverage_depth_freq=50)}, 'typhi': {'file': resource_filename(program_name, 'data/typhi/kmers.fasta'), - 'version': '1.1.0', + 'version': '1.2.0', 'subtyping_params': SubtypingParams(low_coverage_depth_freq=20)}, - 'tb_speciation': {'file': resource_filename(program_name, 'data/m.tuberculosis/kmers.fasta'), - 'version': '1.0.1;', + 'tb_speciation': {'file': resource_filename(program_name, 'data/tb_speciation/kmers.fasta'), + 'version': '1.0.5', 'subtyping_params': SubtypingParams(low_coverage_depth_freq=20)}, 'typhimurium': {'file': resource_filename(program_name, 'data/typhimurium/kmers.fasta'), - 'version': '0.5.5;', + 'version': '0.5.5', 'subtyping_params': SubtypingParams(low_coverage_depth_freq=20)}} diff --git a/bio_hansel/data/m.tuberculosis/kmers.fasta b/bio_hansel/data/tb_speciation/kmers.fasta similarity index 91% rename from bio_hansel/data/m.tuberculosis/kmers.fasta rename to bio_hansel/data/tb_speciation/kmers.fasta index e03b565..98000d7 100644 --- a/bio_hansel/data/m.tuberculosis/kmers.fasta +++ b/bio_hansel/data/tb_speciation/kmers.fasta @@ -22,13 +22,13 @@ CGACATCCTCGATACGGGCCCCCTCGCGGATTG ACGCGTCCTTCGGGAAATGCGCTGGGACCCAAT >negative1491275-1.1.3 ACGCGTCCTTCGGGAAGTGCGCTGGGACCCAAT ->3479545-1.2.1 +>3479545-1.2 ACCGCAGTTTCAGTCGCAGCCTTGACTATCTAC ->negative3479545-1.2.1 -CCGCAGTTTCAGTCGCCGCCTTGACTATCTACG ->3470377-1.2.2 +>negative3479545-1.2 +ACCGCAGTTTCAGTCGCCGCCTTGACTATCTAC +>3470377-1.3 TGGCATCGTCATAGGCTTGCTGGCGGTTAAGGA ->negative3470377-1.2.2 +>negative3470377-1.3 TGGCATCGTCATAGGCCTGCTGGCGGTTAAGGA >497491-2 AGGGCTGGTCGGCCATATCGGGCCCGACGATAT @@ -62,26 +62,26 @@ ACCGGGAACAGTATTCGGATCCGGAAAAGCCGA CGAGGATCGAGCTAGCAGATATTGCGCTGCGCC >negative3273107-3 CGAGGATCGAGCTAGCCGATATTGCGCTGCGCC ->1084911-3.1.1 +>1084911-3.1 
TGTTCATCTCCAGGAAATAGAACTCACCTTCCC ->negative1084911-3.1.1 +>negative1084911-3.1 TGTTCATCTCCAGGAAGTAGAACTCACCTTCCC ->3722702-3.1.2 +>3722702-3.2 GCTGCGCCCCGGCGGCCAGCACGGCCTCCAGCT ->negative3722702-3.1.2 +>negative3722702-3.2 GCTGCGCCCCGGCGGCGAGCACGGCCTCCAGCT ->1237818-3.1.2.1 +>1237818-3.2.1 CGCCGGCAGCCGTGCCGAGCATCAGCGGGAAAC ->negative1237818-3.1.2.1 +>negative1237818-3.2.1 CGCCGGCAGCCGTGCCCAGCATCAGCGGGAAAC ->2874344-3.1.2.2 +>2874344-3.2.2 CGTAGGCCTCCACCCGACGCACCCCGGAGCCGA ->negative2874344-3.1.2.2 +>negative2874344-3.2.2 CGTAGGCCTCCACCCGGCGCACCCCGGAGCCGA >931123-4 GGCGCATCGCCAACTATGAGGAGCTCGCCGCAC >negative931123-4 -GGCGCATCGCCAACTAGGAGGAGCTCGCCGCAC +GGCGCATCGCCAACTACGAGGAGCTCGCCGCAC >62657-4.1 TGTGGAAGCATTCCCCAAGCCGGTCAAGGCCGC >negative62657-4.1 @@ -245,5 +245,4 @@ CCCGCACGTAGCCCTTACCGAGGTCTCGGACGG >1882180-8 CCGGTGTTATCCATGCTGGCTATGGGGGCCAGC >negative1882180-8 -CCGGTGTTATCCATGCCGGCTATGGGGGCCAGC - +CCGGTGTTATCCATGCCGGCTATGGGGGCCAGC \ No newline at end of file diff --git a/bio_hansel/data/tb_speciation/metadata.tsv b/bio_hansel/data/tb_speciation/metadata.tsv new file mode 100644 index 0000000..7a57bb9 --- /dev/null +++ b/bio_hansel/data/tb_speciation/metadata.tsv @@ -0,0 +1,61 @@ +subtype Coll_et_al_genotype +1 1 +1.1 1.1 +2.1 2.1 +3 3 +4 4 +4.1 4.1 +4.2 4.2 +4.3 4.3 +4.4 4.4 +4.5 4.5 +4.6 4.6 +4.7 4.7 +4.8 4.8 +4.9 4.9 +5 5 +7 7 +8 bov_afr +8.1 bov +8.2 6 +1.1.1 1.1.1 +1.1.1.1 1.1.1.1 +1.1.2 1.1.2 +1.1.3 1.1.3 +1.2 1.2.1 +1.3 1.2.2 +2.2.1 2.2.1 +2.2.1.1 2.2.1.1 +2.2.1.2 2.2.1.2 +2.2.2 2.2.2 +3.1 3.1.1 +3.2 3.1.2 +3.2.1 3.1.2.1 +3.2.2 3.1.2.2 +4.1.1 4.1.1 +4.1.1.1 4.1.1.1 +4.1.1.2 4.1.1.2 +4.1.1.3 4.1.1.3 +4.1.2 4.1.2 +4.1.2.1 4.1.2.1 +4.2.1 4.2.1 +4.2.2 4.2.2 +4.2.2.1 4.2.2.1 +4.3.1 4.3.1 +4.3.2 4.3.2 +4.3.2.1 4.3.2.1 +4.3.3 4.3.3 +4.3.4 4.3.4 +4.3.4.1 4.3.4.1 +4.3.4.2 4.3.4.2 +4.3.4.2.1 4.3.4.2.1 +4.4.1 4.4.1 +4.4.1.1 4.4.1.1 +4.4.1.2 4.4.1.2 +4.4.2 4.4.2 +4.6.1 4.6.1 +4.6.1.1 4.6.1.1 +4.6.1.2 4.6.1.2 +4.6.2 4.6.2 +4.6.2.1 4.6.2.1 +4.6.2.2 4.6.2.2 \ No newline 
at end of file diff --git a/bio_hansel/data/typhi/kmers.fasta b/bio_hansel/data/typhi/kmers.fasta index 550d663..f384275 100644 --- a/bio_hansel/data/typhi/kmers.fasta +++ b/bio_hansel/data/typhi/kmers.fasta @@ -1,10 +1,10 @@ >655112-0.1 GGCTCCATCCTTAGACTTGGTCGGTAAAATCTA ->773487-0.0.1 +>773487-0.4 TCTATGGATTTGTTGTAGTGTGGGGTGTGTAAT ->1804415-0.0.2 +>1804415-0.2 TTATTACCTATAAAGGCGAACAAGGCGCGAAAG ->1840727-0.0.3 +>1840727-0.3 GGTTACGCGGTACGCAAAGTAACGCCTGGCGCG >3640678-0.1.1 ACACGCCGGTGGAAGTACTGCATTCCGGGCTAA @@ -14,25 +14,25 @@ ATGGCAGGTAGCCCAATTGGCGCGCCATCCGCA ACACGGGAATGTCGATAGTAAAGGCGTCGCCAT >316489-0 GCAGCCTGCCGCTGACCCCCTTTTTCTGGGGAT ->4105384-1.1 +>4105384-1 GATTTTTTTGCAACTGTTGCGATACGGCGCGAT ->555826-1.1.1 +>555826-1.1 AATTTAAGTTAATTACAGGTCCTTCTGGCTGCG ->2360997-1.1.2 +>2360997-1.2 CACTGAAGGCTTTACCTAGCGCTCTGGCATCTT ->4664137-1.1.3 +>4664137-1.3 AAGTCACCTGCGCGCCTTGGTCGAACGCGGCGA ->2166082-1.1.4 +>2166082-1.4 GCAATTCGCGGTAAAGAGTACTGGCAATAAGAC ->30192-1.2 +>30192-3 CGGGCGACGGTTCCGCTTGGCAACCTTTTGCTA ->4288272-1.2.1 +>4288272-3.1 CGGAACAGTATCTGGGTGCGAATTACTTTCCGG >2737027-2 ACGAATATGGGTTGCAAGACGTCCTGGATTGTA ->1215983-2.0.1 +>1215983-2.6 CGTTGTGGGAGCAGGGGGATGAGCAGGCGCACG ->4132985-2.0.2 +>4132985-2.7 CCAACCAGGTCTTTGCAGAAGTGTTACGCGGCA >146673-2.1 TACGTATGCAGGTACCAGCGCGGGTGGTGGCGT @@ -74,7 +74,7 @@ AGGCTGCGGCACAGAGTAAATGCAGAGCATGGT TTGCCACCCTTGTCACTGCCAGCGCTTTCGCTG >3092900-2.2.3.4 GCTGACCTTGCAGATATGCTTCGTTTTCTTTGT ->2723724-2.2.3.3.1 +>2723724-2.2.3.5 TTAATACACCGAACGGGCATCTGGTCGCCAGCG >3437570-2.4 AGACATCTGGCGGCGTCAGCAGCATCCCCACAA @@ -86,9 +86,9 @@ TATTCACCAGCACCGGTGCCAGACTGGCGTCAT CGCCAAACAGCACAAATGGCAGGTCATTTGCCA >3062270-2.3 TTTCTTGATCGCTTCATGCATGGTCAGTTTTTC ->1799842-2.3.0.1 +>1799842-2.3.7 ATATGCTGCAACTGAACCTGGATAACCCAATCG ->432732-2.3.0.2 +>432732-2.3.8 CCGGCAAGCCGTTGAGTGAAGAGGTTCGGGAAG >3069182-2.3.1 TACGTGCTTTATTGTGGATATCCATGTTCTTCC @@ -136,11 +136,11 @@ GGCAGGCGCGATTGCAACAACGTAAATTGCGCT TTGATGGCTTCATTACTATTACCGGCGGGAAGC >negative655112-0.1 
GGCTCCATCCTTAGACCTGGTCGGTAAAATCTA ->negative773487-0.0.1 +>negative773487-0.4 TCTATGGATTTGTTGTGGTGTGGGGTGTGTAAT ->negative1804415-0.0.2 +>negative1804415-0.2 TTATTACCTATAAAGGTGAACAAGGCGCGAAAG ->negative1840727-0.0.3 +>negative1840727-0.3 GGTTACGCGGTACGCAGAGTAACGCCTGGCGCG >negative3640678-0.1.1 ACACGCCGGTGGAAGTGCTGCATTCCGGGCTAA @@ -150,25 +150,25 @@ ATGGCAGGTAGCCCAACTGGCGCGCCATCCGCA ACACGGGAATGTCGATGGTAAAGGCGTCGCCAT >negative316489-0 GCAGCCTGCCGCTGACTCCCTTTTTCTGGGGAT ->negative4105384-1.1 +>negative4105384-1 GATTTTTTTGCAACTGCTGCGATACGGCGCGAT ->negative555826-1.1.1 +>negative555826-1.1 AATTTAAGTTAATTACCGGTCCTTCTGGCTGCG ->negative2360997-1.1.2 +>negative2360997-1.2 CACTGAAGGCTTTACCCAGCGCTCTGGCATCTT ->negative4664137-1.1.3 +>negative4664137-1.3 AAGTCACCTGCGCGCCCTGGTCGAACGCGGCGA ->negative2166082-1.1.4 +>negative2166082-1.4 GCAATTCGCGGTAAAGGGTACTGGCAATAAGAC ->negative30192-1.2 +>negative30192-3 CGGGCGACGGTTCCGCCTGGCAACCTTTTGCTA ->negative4288272-1.2.1 +>negative4288272-3.1 CGGAACAGTATCTGGGCGCGAATTACTTTCCGG >negative2737027-2 ACGAATATGGGTTGCAGGACGTCCTGGATTGTA ->negative1215983-2.0.1 +>negative1215983-2.6 CGTTGTGGGAGCAGGGTGATGAGCAGGCGCACG ->negative4132985-2.0.2 +>negative4132985-2.7 CCAACCAGGTCTTTGCGGAAGTGTTACGCGGCA >negative146673-2.1 TACGTATGCAGGTACCGGCGCGGGTGGTGGCGT @@ -210,7 +210,7 @@ AGGCTGCGGCACAGAGCAAATGCAGAGCATGGT TTGCCACCCTTGTCACCGCCAGCGCTTTCGCTG >negative3092900-2.2.3.4 GCTGACCTTGCAGATACGCTTCGTTTTCTTTGT ->negative2723724-2.2.3.3.1 +>negative2723724-2.2.3.5 TTAATACACCGAACGGACATCTGGTCGCCAGCG >negative3437570-2.4 AGACATCTGGCGGCGTAAGCAGCATCCCCACAA @@ -222,9 +222,9 @@ TATTCACCAGCACCGGCGCCAGACTGGCGTCAT CGCCAAACAGCACAAACGGCAGGTCATTTGCCA >negative3062270-2.3 TTTCTTGATCGCTTCACGCATGGTCAGTTTTTC ->negative1799842-2.3.0.1 +>negative1799842-2.3.7 ATATGCTGCAACTGAATCTGGATAACCCAATCG ->negative432732-2.3.0.2 +>negative432732-2.3.8 CCGGCAAGCCGTTGAGCGAAGAGGTTCGGGAAG >negative3069182-2.3.1 TACGTGCTTTATTGTGAATATCCATGTTCTTCC diff --git a/bio_hansel/data/typhi/metadata.tsv 
b/bio_hansel/data/typhi/metadata.tsv index 6455036..0443d79 100644 --- a/bio_hansel/data/typhi/metadata.tsv +++ b/bio_hansel/data/typhi/metadata.tsv @@ -1,27 +1,27 @@ -subtype Alternate_Name -0 Typhi_1 -0.1 Typhi_0.1 -1.1 Typhi_1.1 -1.2 Typhi_1.2 -2 Typhi_2 -2.1 Typhi_2.1 -2.2 Typhi_2.2 -2.3 Typhi_3 -2.4 Typhi_2.4 -2.5 Typhi_2.5 -0.0.1 Typhi_0.0.1 -0.0.2 Typhi_0.0.2 -0.0.3 Typhi_0.0.3 +subtype Wong_et_al_genotype +0 Typhi_1.0.0 +0.1 Typhi_0.1.0 +1 Typhi_1.1.0 +3 Typhi_1.2.0 +2 Typhi_2.0.0 +2.1 Typhi_2.1.0 +2.2 Typhi_2.2.0 +2.3 Typhi_3.0.0 +2.4 Typhi_2.4.0 +2.5 Typhi_2.5.0 +0.4 Typhi_0.0.1 +0.2 Typhi_0.0.2 +0.3 Typhi_0.0.3 0.1.1 Typhi_0.1.1 0.1.2 Typhi_0.1.2 0.1.3 Typhi_0.1.3 -1.1.1 Typhi_1.1.1 -1.1.2 Typhi_1.1.2 -1.1.3 Typhi_1.1.3 -1.1.4 Typhi_1.1.4 -1.2.1 Typhi_1.2.1 -2.0.1 Typhi_2.0.1 -2.0.2 Typhi_2.0.2 +1.1 Typhi_1.1.1 +1.2 Typhi_1.1.2 +1.3 Typhi_1.1.3 +1.4 Typhi_1.1.4 +3.1 Typhi_1.2.1 +2.6 Typhi_2.0.1 +2.7 Typhi_2.0.2 2.1.1 Typhi_2.1.1 2.1.2 Typhi_2.1.2 2.1.3 Typhi_2.1.3 @@ -34,36 +34,36 @@ subtype Alternate_Name 2.2.1 Typhi_2.2.1 2.2.2 Typhi_2.2.2 2.2.2.1 Typhi_2.2.3 -2.2.3 Typhi_2.3 +2.2.3 Typhi_2.3.0 2.2.3.2 Typhi_2.3.2 2.2.3.2.1 Typhi_2.3.1 2.2.3.3 Typhi_2.3.3 -2.2.3.3.1 Typhi_2.3.5 +2.2.3.5 Typhi_2.3.5 2.2.3.4 Typhi_2.3.4 2.2.4 Typhi_2.2.4 -2.3.0.1 Typhi_3.0.1 -2.3.0.2 Typhi_3.0.2 -2.3.1 Typhi_3.1 +2.3.7 Typhi_3.0.1 +2.3.8 Typhi_3.0.2 +2.3.1 Typhi_3.1.0 2.3.1.1 Typhi_3.1.1 2.3.1.2 Typhi_3.1.2 -2.3.2 Typhi_3.2 +2.3.2 Typhi_3.2.0 2.3.2.1 Typhi_3.2.1 2.3.2.2 Typhi_3.2.2 -2.3.3 Typhi_3.3 +2.3.3 Typhi_3.3.0 2.3.3.1 Typhi_3.3.1 -2.3.4 Typhi_3.4 -2.3.5 Typhi_3.5 +2.3.4 Typhi_3.4.0 +2.3.5 Typhi_3.5.0 2.3.5.1 Typhi_3.5.1 2.3.5.2 Typhi_3.5.2 2.3.5.4 Typhi_3.5.4 2.3.5.4.1 Typhi_3.5.3 -2.3.6 Typhi_4 -2.3.6.1 Typhi_4.1 +2.3.6 Typhi_4.0.0 +2.3.6.1 Typhi_4.1.0 2.3.6.1.1 Typhi_4.1.1 -2.3.6.2 Typhi_4.2 +2.3.6.2 Typhi_4.2.0 2.3.6.2.1 Typhi_4.2.1 2.3.6.2.2 Typhi_4.2.2 2.3.6.2.3 Typhi_4.2.3 2.3.6.3 Typhi_4.3.1 2.4.1 Typhi_2.4.1 -2.5.1 Typhi_2.5.1 +2.5.1 Typhi_2.5.1 \ No 
newline at end of file diff --git a/bio_hansel/main.py b/bio_hansel/main.py index 77caa00..95801b8 100644 --- a/bio_hansel/main.py +++ b/bio_hansel/main.py @@ -52,7 +52,7 @@ def init_parser(): help='Input genome FASTA/FASTQ files (can be Gzipped)') parser.add_argument('-s', '--scheme', default='heidelberg', - help='Scheme to use for subtyping (built-in: "heidelberg", "enteritidis", "typhi", "tb_speciation"; OR user-specified: ' + help='Scheme to use for subtyping (built-in: "heidelberg", "enteritidis", "typhi", "typhimurium", "tb_speciation"; OR user-specified: ' '/path/to/user/scheme)') parser.add_argument('--scheme-name', help='Custom user-specified SNP substyping scheme name') diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py index 02f9016..e4079ee 100644 --- a/bio_hansel/qc/checks.py +++ b/bio_hansel/qc/checks.py @@ -140,7 +140,7 @@ def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str """ if not st.are_subtypes_consistent: return QC.FAIL, f'Mixed subtypes found: "{"; ".join(sorted(st.inconsistent_subtypes))}".' 
- conflicting_kmers = get_conflicting_kmers(st, df) + conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input()) if conflicting_kmers is None or conflicting_kmers.shape[0] == 0: return None, None @@ -250,7 +250,7 @@ def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingPar total_subtype_kmers = int(st.n_kmers_matching_subtype_expected) total_subtype_kmers_hits = int(st.n_kmers_matching_subtype) - conflicting_kmers = get_conflicting_kmers(st, df) + conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input()) num_pos_kmers, num_neg_kmers = get_num_pos_neg_kmers(st, df) obs = int(st.n_kmers_matching_all) exp = int(st.n_kmers_matching_all_expected) diff --git a/bio_hansel/qc/utils.py b/bio_hansel/qc/utils.py index 0f2ede8..61542dd 100644 --- a/bio_hansel/qc/utils.py +++ b/bio_hansel/qc/utils.py @@ -1,11 +1,25 @@ -from typing import Tuple, Optional, List, Any, Dict +from typing import Tuple, Optional, List, Any, Dict, Iterable from pandas import DataFrame from ..subtype import Subtype -def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]: +def component_subtypes(subtype: str) -> Iterable[str]: + """Generate component subtypes from a subtype. + + Args: + subtype: Subtype string, e.g. "4.2.1.1" + Yields: + Component subtypes (e.g. for subtype "4.2.1.1", will yield + ['4', '4.2', '4.2.1', '4.2.1.1']) + """ + split_subtype = subtype.split('.') + for i, x in enumerate(split_subtype): + yield '.'.join(split_subtype[:i+1]) + + +def get_conflicting_kmers(subtype: str, df: DataFrame, is_fastq_input: bool = True) -> Optional[DataFrame]: """ Get positive and negative kmers that both are present for a subtype. Find positive and negative kmers for the same refposition/target site in the results `df`. 
@@ -17,10 +31,9 @@ def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]: Returns: DataFrame of conflicting positive and negative kmers """ - if st.is_fastq_input(): - dfst = df[(df['subtype'] == str(st.subtype)) & (df['is_kmer_freq_okay'])] - else: # fasta files - dfst = df[(df['subtype'] == str(st.subtype))] + dfst = df[(df['subtype'].isin(list(component_subtypes(subtype))))] + if is_fastq_input: + dfst = dfst[dfst['is_kmer_freq_okay']] pos_kmer_positions = dfst[dfst['is_pos_kmer']]['refposition'] neg_kmers = dfst[~dfst['is_pos_kmer']] diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100755 index 0000000..4ea07ea --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,12 @@ +pip==18.1 +bumpversion==0.5.3 +wheel==0.32.1 +watchdog==0.9.0 +flake8==3.5.0 +tox==3.5.2 +coverage==4.5.1 +Sphinx==1.8.1 +twine==1.12.1 + +pytest==3.8.2 +pytest-runner==4.2 diff --git a/setup.py b/setup.py index 6700484..60fbab5 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,14 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- -from distutils.core import setup - -from setuptools import find_packages +from setuptools import setup, find_packages from bio_hansel import __version__, program_name, program_desc classifiers = """ Development Status :: 3 - Alpha Environment :: Console -License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) +License :: OSI Approved :: Apache Software License Intended Audience :: Science/Research Topic :: Scientific/Engineering Topic :: Scientific/Engineering :: Bio-Informatics @@ -19,15 +18,19 @@ Operating System :: POSIX :: Linux """.strip().split('\n') +with open('README.rst') as readme_file: + readme = readme_file.read() + setup( name=program_name, version=__version__, packages=find_packages(exclude=['tests']), url='https://github.com/phac-nml/{}'.format(program_name), - license='Apache v2.0', + license='Apache Software License 2.0', author='Peter Kruczkiewicz', 
author_email='peter.kruczkiewicz@gmail.com', description=program_desc, + long_description=readme, keywords='Salmonella enterica Heidelberg Enteritidis SNP kmer subtyping Aho-Corasick', classifiers=classifiers, package_dir={program_name: program_name}, diff --git a/tests/data/qc/conflicting_subtypes/fail.tsv b/tests/data/qc/conflicting_subtypes/fail.tsv new file mode 100644 index 0000000..ec81f8a --- /dev/null +++ b/tests/data/qc/conflicting_subtypes/fail.tsv @@ -0,0 +1,64 @@ +refposition subtype is_pos_kmer is_kmer_freq_okay +931123 4 True True +62657 4.1 True True +891756 4.1.2 True True +3216553 1.1.1.1 False True +3479545 1.2.1 False True +3470377 1.2.2 False True +3466426 4.6 False True +3388166 4.3.2.1 False True +3273107 3 False True +3021283 1.1.1 False True +3722702 3.1.2 False True +2875883 4.6.2.2 False True +2874344 3.1.2.2 False True +2831482 8.1 False True +2694560 4.4.1.2 False True +2622402 1.1.2 False True +3570528 4.6.2.1 False True +62657 4.1 False True +2411730 4.2 False True +3836274 2.2.1.2 False True +3836739 4.8 False True +3977226 4.3.4 False True +4125058 4.6.2 False True +4151558 4.4.1 False True +4229087 4.1.1.3 False True +4246508 4.4.2 False True +4248115 2.2.1.1 False True +4249732 4.7 False True +4260268 4.6.1 False True +4307886 4.4 False True +4316114 4.3.2 False True +4398141 4.3.4.1 False True +2505085 2.2 False True +1881090 2.1 False True +1882180 8 False True +874787 4.6.1.1 False True +107794 4.1.2.1 False True +346693 2.2.2 False True +355181 4.4.1.1 False True +403364 4.3.3 False True +497491 2 False True +514245 4.1.1 False True +541048 4.1.1.2 False True +615614 4.3.1 False True +615938 1 False True +764995 4.3 False True +783601 4.2.1 False True +797736 2.2.1 False True +1084911 3.1.1 False True +1850119 4.1.1.1 False True +1132368 4.3.4.2 False True +1137518 7 False True +1237818 3.1.2.1 False True +1455780 4.2.2.1 False True +1487796 4.2.2 False True +1491275 1.1.3 False True +1501468 4.6.1.2 False True +1502120 
4.3.4.2.1 False True +1719757 4.5 False True +1759252 4.9 False True +1799921 5 False True +1816587 8.2 False True +4404247 1.1 False True diff --git a/tests/data/qc/conflicting_subtypes/pass.tsv b/tests/data/qc/conflicting_subtypes/pass.tsv new file mode 100644 index 0000000..87547e5 --- /dev/null +++ b/tests/data/qc/conflicting_subtypes/pass.tsv @@ -0,0 +1,63 @@ +refposition subtype is_pos_kmer is_kmer_freq_okay +931123 4 True True +62657 4.1 True True +891756 4.1.2 True True +3216553 1.1.1.1 False True +3479545 1.2.1 False True +3470377 1.2.2 False True +3466426 4.6 False True +3388166 4.3.2.1 False True +3273107 3 False True +3021283 1.1.1 False True +3722702 3.1.2 False True +2875883 4.6.2.2 False True +2874344 3.1.2.2 False True +2831482 8.1 False True +2694560 4.4.1.2 False True +2622402 1.1.2 False True +3570528 4.6.2.1 False True +2411730 4.2 False True +3836274 2.2.1.2 False True +3836739 4.8 False True +3977226 4.3.4 False True +4125058 4.6.2 False True +4151558 4.4.1 False True +4229087 4.1.1.3 False True +4246508 4.4.2 False True +4248115 2.2.1.1 False True +4249732 4.7 False True +4260268 4.6.1 False True +4307886 4.4 False True +4316114 4.3.2 False True +4398141 4.3.4.1 False True +2505085 2.2 False True +1881090 2.1 False True +1882180 8 False True +874787 4.6.1.1 False True +107794 4.1.2.1 False True +346693 2.2.2 False True +355181 4.4.1.1 False True +403364 4.3.3 False True +497491 2 False True +514245 4.1.1 False True +541048 4.1.1.2 False True +615614 4.3.1 False True +615938 1 False True +764995 4.3 False True +783601 4.2.1 False True +797736 2.2.1 False True +1084911 3.1.1 False True +1850119 4.1.1.1 False True +1132368 4.3.4.2 False True +1137518 7 False True +1237818 3.1.2.1 False True +1455780 4.2.2.1 False True +1487796 4.2.2 False True +1491275 1.1.3 False True +1501468 4.6.1.2 False True +1502120 4.3.4.2.1 False True +1719757 4.5 False True +1759252 4.9 False True +1799921 5 False True +1816587 8.2 False True +4404247 1.1 False True 
diff --git a/tests/test_qc_utils.py b/tests/test_qc_utils.py index edd1ff4..0fd1886 100644 --- a/tests/test_qc_utils.py +++ b/tests/test_qc_utils.py @@ -2,7 +2,12 @@ import pandas as pd -from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts + +from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts, component_subtypes, get_conflicting_kmers + + +fail_tsv = 'tests/data/qc/conflicting_subtypes/fail.tsv' +pass_tsv = 'tests/data/qc/conflicting_subtypes/pass.tsv' def test_get_mixed_subtype_kmer_counts(): @@ -22,3 +27,21 @@ def test_get_mixed_subtype_kmer_counts(): assert(int(st_pos_kmers.get('2.1')) == 5) assert(int(st_pos_kmers.get('2.2')) == 3) assert(int(st_pos_kmers.get('0')) == 1) + + +def test_component_subtypes(): + assert list(component_subtypes('4.2.1.1')) == ['4', '4.2', '4.2.1', '4.2.1.1'] + assert list(component_subtypes('1')) == ['1'] + + +def test_get_conflicting_kmers(): + df_pass = pd.read_csv(pass_tsv, sep='\t') + df_pass_result = get_conflicting_kmers('4.1.2', df_pass, True) + assert df_pass_result.shape[0] == 0, 'Must be no conflicting kmers' + df_fail = pd.read_csv(fail_tsv, sep='\t') + df_fail_result = get_conflicting_kmers('4.1.2', df_fail, True) + df_fail_result.reset_index(inplace=True) + assert df_fail_result.shape[0] == 1, 'Must be one conflicting kmer' + assert df_fail_result.refposition[0] == 62657 + assert df_fail_result.subtype[0] == '4.1' + assert df_fail_result.is_pos_kmer[0] == False diff --git a/tests/test_subtyping_reads.py b/tests/test_subtyping_reads.py index 88151c6..59340a3 100644 --- a/tests/test_subtyping_reads.py +++ b/tests/test_subtyping_reads.py @@ -100,7 +100,7 @@ def subtype_tb_AP018036_pass(): subtype='2.2.1', file_path=fasta_tb_pass, are_subtypes_consistent=True, - n_kmers_matching_all=61, + n_kmers_matching_all=62, n_kmers_matching_all_expected='62', n_kmers_matching_positive=3, n_kmers_matching_positive_expected='3', diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..cfa182b --- /dev/null 
+++ b/tox.ini @@ -0,0 +1,23 @@ +[tox] +envlist = py36, flake8 + +[travis] +python = + 3.6: py36 + +[testenv:flake8] +basepython = python +deps = flake8 +commands = flake8 bio_hansel + +[testenv] +setenv = + PYTHONPATH = {toxinidir} +deps = + -r{toxinidir}/requirements_dev.txt +; If you want to make tox run the tests with the same versions, create a +; requirements.txt with the pinned versions and uncomment the following line: +; -r{toxinidir}/requirements.txt +commands = + pip install -U pip + py.test --basetemp={envtmpdir}