diff --git a/src/pyokit/datastruct/genomicInterval.py b/src/pyokit/datastruct/genomicInterval.py index d7b478c..95793fe 100755 --- a/src/pyokit/datastruct/genomicInterval.py +++ b/src/pyokit/datastruct/genomicInterval.py @@ -54,6 +54,36 @@ class InvalidStartIndexError(GenomicIntervalError): # FUNCTIONS FOR MANIPULATING COLLECTIONS OF GENOMIC INTERVALS # ############################################################################### +def jaccardIndex(s1, s2, stranded=False): + """ + Compute the Jaccard index for two collections of genomic intervals + + :param s1: the first set of genomic intervals + :param s2: the second set of genomic intervals + :param stranded: if True, treat regions on different strands as not + intersecting each other, even if they occupy the same + genomic region. + :return: Jaccard index + """ + + def count(s): + """ sum the size of regions in s. """ + tot = 0 + for r in s: + tot += len(r) + return tot + + if stranded: + raise GenomicIntervalError("Sorry, stranded mode for computing Jaccard " + + "index hasn't been implemented yet.") + + s1 = collapseRegions(s1) + s2 = collapseRegions(s2) + intersection = regionsIntersection(s1, s2) + c_i = count(intersection) + return c_i / float(count(s1) + count(s2) - c_i) + + def intervalTreesFromList(inElements, verbose=False, openEnded=False): """ build a dictionary, indexed by chrom name, of interval trees for each chrom. @@ -197,7 +227,7 @@ def get_first_matching_index(s, proc_strands): return res -def regionsIntersection(s1, s2): +def regionsIntersection(s1, s2, collapse=True): """ given two lists of genomic regions with chromosome, start and end coordinates, return a new list of regions which is the intersection of those diff --git a/src/pyokit/scripts/genomicIntJaccard.py b/src/pyokit/scripts/genomicIntJaccard.py new file mode 100644 index 0000000..556e723 --- /dev/null +++ b/src/pyokit/scripts/genomicIntJaccard.py @@ -0,0 +1,147 @@ +#!/usr/bin/python + +""" + Date of Creation: Dec 2015 + Description: Compute the Jaccard index for regions in two files + + Copyright (C) 2010-2014 + Philip J. Uren + + Authors: Philip J. Uren + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +""" + +# python imports +import os +import sys +import unittest +import StringIO + +# for unit testing +import mock + +# pyokit imports +from pyokit.interface.cli import CLI, Option +from pyokit.io.bedIterators import BEDIterator +from pyokit.datastruct.genomicInterval import jaccardIndex +from pyokit.util.testing import build_mock_open_side_effect + + +############################################################################### +# USER INTERFACE # +############################################################################### + +def getUI(args): + """ + build and return a UI object for this script. + + :param args: raw arguments to parse + """ + programName = os.path.basename(sys.argv[0]) + longDescription = "Compute the Jaccard index for two sets of genomic " +\ + "intervals." + shortDescription = longDescription + + ui = CLI(programName, shortDescription, longDescription) + ui.minArgs = 2 + ui.maxArgs = 2 + ui.addOption(Option(short="v", long="verbose", + description="output additional messages to stderr " + + "about run", required=False)) + ui.addOption(Option(short="s", long="stranded", + description="treat regions on separate strands as " + + "disjoint, even if they overlap", + required=False)) + ui.addOption(Option(short="h", long="help", + description="show this help message ", special=True)) + ui.addOption(Option(short="u", long="test", + description="run unit tests ", special=True)) + + ui.parseCommandLine(args) + return ui + + +################################################################################ +# COMMAND LINE PROCESSING AND DISPATCH # +################################################################################ + +def main(args): + """ + main entry point for the GenomicIntJaccard script. + + :param args: the arguments for this script, as a list of string. Should + already have had things like the script name stripped. That + is, if there are no args provided, this should be an empty + list. + """ + # get options and arguments + ui = getUI(args) + + if ui.optionIsSet("test"): + # just run unit tests + unittest.main(argv=[sys.argv[0]]) + elif ui.optionIsSet("help"): + # just show help + ui.usage() + else: + verbose = ui.optionIsSet("verbose") + stranded = ui.optionIsSet("stranded") + + if stranded: + sys.stderr.write("Sorry, stranded mode hasn't been implemented yet.") + sys.exit() + + # we required two input files, so we know these will be present... + regions_1 = [e for e in BEDIterator(ui.getArgument(0), verbose=verbose)] + regions_2 = [e for e in BEDIterator(ui.getArgument(1), verbose=verbose)] + + print jaccardIndex(regions_1, regions_2) + + +############################################################################### +# UNIT TESTS FOR THIS MODULE # +############################################################################### + +class TestGenomicIntJaccard(unittest.TestCase): + + def setUp(self): + f1_conts = "\n".join(["\t".join(["chr1", "10", "20", "F1_R1", "0", "+"]), + "\t".join(["chr1", "70", "80", "F2_R2", "0", "+"]), + "\t".join(["chr2", "05", "10", "F2_R2", "0", "+"]), + "\t".join(["chr2", "70", "75", "F2_R2", "0", "+"]), + "\t".join(["chr2", "90", "95", "F2_R2", "0", "+"])]) + f2_conts = "\n".join(["\t".join(["chr1", "07", "12", "F1_R1", "0", "+"]), + "\t".join(["chr1", "67", "70", "F2_R2", "0", "+"]), + "\t".join(["chr1", "75", "85", "F2_R2", "0", "+"]), + "\t".join(["chr2", "20", "30", "F2_R2", "0", "+"]), + "\t".join(["chr2", "73", "92", "F2_R2", "0", "+"])]) + string_d = {"file1.bed": f1_conts, + "file2.bed": f2_conts} + self.mock_open_side_effect = build_mock_open_side_effect(string_d, {}) + + @mock.patch('__builtin__.open') + @mock.patch('sys.stdout', new_callable=StringIO.StringIO) + def test_simple(self, mock_stdout, mock_open): + mock_open.side_effect = self.mock_open_side_effect + main(["file1.bed", "file2.bed"]) + self.assertAlmostEqual(float(mock_stdout.getvalue()), 0.1549296) + + +############################################################################### +# ENTRY POINT WHEN RUN AS A STAND-ALONE MODULE # +############################################################################### + +if __name__ == "__main__": + unittest.main(argv=[sys.argv[0]]) diff --git a/src/pyokit/scripts/genomicIntersection.py b/src/pyokit/scripts/genomicIntersection.py new file mode 100644 index 0000000..c0609b0 --- /dev/null +++ b/src/pyokit/scripts/genomicIntersection.py @@ -0,0 +1,165 @@ +#!/usr/bin/python + +""" + Date of Creation: Dec 2015 + Description: Get the intersection of genomic regions in one or more + files + + Copyright (C) 2010-2015 + Philip J. Uren + + Authors: Philip J. Uren + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +""" + +# standard python imports +import os +import sys +import unittest +import StringIO + +# for unit testing +import mock + +# pyokit imports +from pyokit.interface.cli import CLI, Option +from pyokit.io.bedIterators import BEDIterator +from pyokit.util.testing import build_mock_open_side_effect +from pyokit.datastruct.genomicInterval import regionsIntersection + + +################################################################################ +# USER INTERFACE # +################################################################################ + +def getUI(args): + """ + build and return a UI object for this script. + + :param args: raw arguments to parse + """ + programName = os.path.basename(sys.argv[0]) + longDescription = "takes a file with a list of p-values and applies " +\ + "Benjamini and Hochberg FDR to convert to q-values " + shortDescription = "takes a file with a list of p-values and applies " +\ + "Benjamini and Hochberg FDR to convert to q-values " + + ui = CLI(programName, shortDescription, longDescription) + ui.minArgs = 2 + ui.maxArgs = 2 + ui.addOption(Option(short="o", long="output", argName="filename", + description="output to given file, else stdout", + required=False, type=str)) + ui.addOption(Option(short="s", long="stranded", + description="treat regions on separate strands as " + + "disjoint, even if they overlap", + required=False)) + ui.addOption(Option(short="v", long="verbose", + description="output additional messages to stderr " + + "about run", required=False)) + ui.addOption(Option(short="h", long="help", + description="show this help message ", special=True)) + ui.addOption(Option(short="u", long="test", + description="run unit tests ", special=True)) + + ui.parseCommandLine(args) + return ui + + +################################################################################ +# COMMAND LINE PROCESSING AND DISPATCH # +################################################################################ + +def main(args): + """ + main entry point for the GenomicIntIntersection script. + + :param args: the arguments for this script, as a list of string. Should + already have had things like the script name stripped. That + is, if there are no args provided, this should be an empty + list. + """ + # get options and arguments + ui = getUI(args) + + if ui.optionIsSet("test"): + # just run unit tests + unittest.main(argv=[sys.argv[0]]) + elif ui.optionIsSet("help"): + # just show help + ui.usage() + else: + verbose = ui.optionIsSet("verbose") + + # stranded? + stranded = ui.optionIsSet("stranded") + if stranded: + sys.stderr.write("Sorry, stranded mode hasn't been implemented yet.") + sys.exit() + + # get output handle + out_fh = sys.stdout + if ui.optionIsSet("output"): + out_fh = open(ui.getValue("output"), "w") + + # get input file-handles -- we know we'll get exactly two, since we + # specified it in the UI definition + regions_1 = [x for x in BEDIterator(ui.getArgument(0), verbose=verbose)] + regions_2 = [x for x in BEDIterator(ui.getArgument(1), verbose=verbose)] + + for r in regionsIntersection(regions_1, regions_2): + out_fh.write(str(r) + "\n") + + +############################################################################### +# UNIT TESTS # +############################################################################### + +class TestGenomicIntIntersection(unittest.TestCase): + + """Unit tests for the GenomicIntIntersection program/script.""" + + def setUp(self): + f1_conts = "\n".join(["\t".join(["chr1", "10", "20", "F1_R1", "0", "+"]), + "\t".join(["chr1", "70", "80", "F2_R2", "0", "+"]), + "\t".join(["chr2", "05", "10", "F2_R2", "0", "+"]), + "\t".join(["chr2", "70", "75", "F2_R2", "0", "+"]), + "\t".join(["chr2", "90", "95", "F2_R2", "0", "+"])]) + f2_conts = "\n".join(["\t".join(["chr1", "07", "12", "F1_R1", "0", "+"]), + "\t".join(["chr1", "67", "70", "F2_R2", "0", "+"]), + "\t".join(["chr1", "75", "85", "F2_R2", "0", "+"]), + "\t".join(["chr2", "20", "30", "F2_R2", "0", "+"]), + "\t".join(["chr2", "73", "92", "F2_R2", "0", "+"])]) + self.str_d = {"file1.bed": f1_conts, "file2.bed": f2_conts} + + @mock.patch('__builtin__.open') + def testTwoFiles(self, mock_open): + outfh = StringIO.StringIO() + stream_d = {"out.bed": outfh} + mock_open.side_effect = build_mock_open_side_effect(self.str_d, stream_d) + main(["-o", "out.bed", "file1.bed", "file2.bed"]) + expect = "\n".join(["\t".join(["chr1", "10", "12", "X", "0", "+"]), + "\t".join(["chr1", "75", "80", "X", "0", "+"]), + "\t".join(["chr2", "73", "75", "X", "0", "+"]), + "\t".join(["chr2", "90", "92", "X", "0", "+"])]) + "\n" + self.assertEqual(outfh.getvalue(), expect) + + +############################################################################### +# ENTRY POINT WHEN RUN AS A STAND-ALONE MODULE # +############################################################################### + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/src/test.py b/src/test.py index eaba3ff..8096ee3 100755 --- a/src/test.py +++ b/src/test.py @@ -92,6 +92,8 @@ from pyokit.scripts.overlapProfile import TestOverlapProfile from pyokit.scripts.readCounts import ConvertJunctionsUnitTests from pyokit.scripts.readCountExonToGene import ExonToGeneReadCountTests +from pyokit.scripts.genomicIntJaccard import TestGenomicIntJaccard +from pyokit.scripts.genomicIntersection import TestGenomicIntIntersection from pyokit.scripts.pyokitMain import PyokitSmokeTests @@ -151,6 +153,8 @@ sys.stderr.write(" " + str(TestOverlapProfile) + "\n") sys.stderr.write(" " + str(ConvertJunctionsUnitTests) + "\n") sys.stderr.write(" " + str(ExonToGeneReadCountTests) + "\n") + sys.stderr.write(" " + str(TestGenomicIntJaccard) + "\n") + sys.stderr.write(" " + str(TestGenomicIntIntersection) + "\n") sys.stderr.write(" " + str(PyokitSmokeTests) + "\n\n") sys.stderr.write("\n\n RUNNING TESTS \n\n")