diff --git a/src/pyokit/datastruct/genomicInterval.py b/src/pyokit/datastruct/genomicInterval.py
index d7b478c..95793fe 100755
--- a/src/pyokit/datastruct/genomicInterval.py
+++ b/src/pyokit/datastruct/genomicInterval.py
@@ -54,6 +54,36 @@ class InvalidStartIndexError(GenomicIntervalError):
# FUNCTIONS FOR MANIPULATING COLLECTIONS OF GENOMIC INTERVALS #
###############################################################################
+def jaccardIndex(s1, s2, stranded=False):
+ """
+ Compute the Jaccard index for two collections of genomic intervals
+
+ :param s1: the first set of genomic intervals
+ :param s2: the second set of genomic intervals
+ :param stranded: if True, treat regions on different strands as not
+ intersecting each other, even if they occupy the same
+ genomic region.
+ :return: Jaccard index
+ """
+
+ def count(s):
+ """ sum the size of regions in s. """
+ tot = 0
+ for r in s:
+ tot += len(r)
+ return tot
+
+ if stranded:
+ raise GenomicIntervalError("Sorry, stranded mode for computing Jaccard " +
+ "index hasn't been implemented yet.")
+
+ s1 = collapseRegions(s1)
+ s2 = collapseRegions(s2)
+ intersection = regionsIntersection(s1, s2)
+ c_i = count(intersection)
+ return c_i / float(count(s1) + count(s2) - c_i)
+
+
def intervalTreesFromList(inElements, verbose=False, openEnded=False):
"""
build a dictionary, indexed by chrom name, of interval trees for each chrom.
@@ -197,7 +227,7 @@ def get_first_matching_index(s, proc_strands):
return res
-def regionsIntersection(s1, s2):
+def regionsIntersection(s1, s2, collapse=True):
"""
given two lists of genomic regions with chromosome, start and end
coordinates, return a new list of regions which is the intersection of those
diff --git a/src/pyokit/scripts/genomicIntJaccard.py b/src/pyokit/scripts/genomicIntJaccard.py
new file mode 100644
index 0000000..556e723
--- /dev/null
+++ b/src/pyokit/scripts/genomicIntJaccard.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python
+
+"""
+ Date of Creation: Dec 2015
+ Description: Compute the Jaccard index for regions in two files
+
+ Copyright (C) 2010-2014
+ Philip J. Uren
+
+ Authors: Philip J. Uren
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+"""
+
+# python imports
+import os
+import sys
+import unittest
+import StringIO
+
+# for unit testing
+import mock
+
+# pyokit imports
+from pyokit.interface.cli import CLI, Option
+from pyokit.io.bedIterators import BEDIterator
+from pyokit.datastruct.genomicInterval import jaccardIndex
+from pyokit.util.testing import build_mock_open_side_effect
+
+
+###############################################################################
+# USER INTERFACE #
+###############################################################################
+
+def getUI(args):
+ """
+ build and return a UI object for this script.
+
+ :param args: raw arguments to parse
+ """
+ programName = os.path.basename(sys.argv[0])
+ longDescription = "Compute the Jaccard index for two sets of genomic " +\
+ "intervals."
+ shortDescription = longDescription
+
+ ui = CLI(programName, shortDescription, longDescription)
+ ui.minArgs = 2
+ ui.maxArgs = 2
+ ui.addOption(Option(short="v", long="verbose",
+ description="output additional messages to stderr " +
+ "about run", required=False))
+ ui.addOption(Option(short="s", long="stranded",
+ description="treat regions on separate strands as " +
+ "disjoint, even if they overlap",
+ required=False))
+ ui.addOption(Option(short="h", long="help",
+ description="show this help message ", special=True))
+ ui.addOption(Option(short="u", long="test",
+ description="run unit tests ", special=True))
+
+ ui.parseCommandLine(args)
+ return ui
+
+
+################################################################################
+# COMMAND LINE PROCESSING AND DISPATCH #
+################################################################################
+
+def main(args):
+ """
+ main entry point for the GenomicIntJaccard script.
+
+ :param args: the arguments for this script, as a list of string. Should
+ already have had things like the script name stripped. That
+ is, if there are no args provided, this should be an empty
+ list.
+ """
+ # get options and arguments
+ ui = getUI(args)
+
+ if ui.optionIsSet("test"):
+ # just run unit tests
+ unittest.main(argv=[sys.argv[0]])
+ elif ui.optionIsSet("help"):
+ # just show help
+ ui.usage()
+ else:
+ verbose = ui.optionIsSet("verbose")
+ stranded = ui.optionIsSet("stranded")
+
+ if stranded:
+ sys.stderr.write("Sorry, stranded mode hasn't been implemented yet.")
+ sys.exit()
+
+ # we required two input files, so we know these will be present...
+ regions_1 = [e for e in BEDIterator(ui.getArgument(0), verbose=verbose)]
+ regions_2 = [e for e in BEDIterator(ui.getArgument(1), verbose=verbose)]
+
+ print jaccardIndex(regions_1, regions_2)
+
+
+###############################################################################
+# UNIT TESTS FOR THIS MODULE #
+###############################################################################
+
+class TestGenomicIntJaccard(unittest.TestCase):
+
+ def setUp(self):
+ f1_conts = "\n".join(["\t".join(["chr1", "10", "20", "F1_R1", "0", "+"]),
+ "\t".join(["chr1", "70", "80", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "05", "10", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "70", "75", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "90", "95", "F2_R2", "0", "+"])])
+ f2_conts = "\n".join(["\t".join(["chr1", "07", "12", "F1_R1", "0", "+"]),
+ "\t".join(["chr1", "67", "70", "F2_R2", "0", "+"]),
+ "\t".join(["chr1", "75", "85", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "20", "30", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "73", "92", "F2_R2", "0", "+"])])
+ string_d = {"file1.bed": f1_conts,
+ "file2.bed": f2_conts}
+ self.mock_open_side_effect = build_mock_open_side_effect(string_d, {})
+
+ @mock.patch('__builtin__.open')
+ @mock.patch('sys.stdout', new_callable=StringIO.StringIO)
+ def test_simple(self, mock_stdout, mock_open):
+ mock_open.side_effect = self.mock_open_side_effect
+ main(["file1.bed", "file2.bed"])
+ self.assertAlmostEqual(float(mock_stdout.getvalue()), 0.1549296)
+
+
+###############################################################################
+# ENTRY POINT WHEN RUN AS A STAND-ALONE MODULE #
+###############################################################################
+
+if __name__ == "__main__":
+ unittest.main(argv=[sys.argv[0]])
diff --git a/src/pyokit/scripts/genomicIntersection.py b/src/pyokit/scripts/genomicIntersection.py
new file mode 100644
index 0000000..c0609b0
--- /dev/null
+++ b/src/pyokit/scripts/genomicIntersection.py
@@ -0,0 +1,165 @@
+#!/usr/bin/python
+
+"""
+ Date of Creation: Dec 2015
+ Description: Get the intersection of genomic regions in one or more
+ files
+
+ Copyright (C) 2010-2015
+ Philip J. Uren
+
+ Authors: Philip J. Uren
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+"""
+
+# standard python imports
+import os
+import sys
+import unittest
+import StringIO
+
+# for unit testing
+import mock
+
+# pyokit imports
+from pyokit.interface.cli import CLI, Option
+from pyokit.io.bedIterators import BEDIterator
+from pyokit.util.testing import build_mock_open_side_effect
+from pyokit.datastruct.genomicInterval import regionsIntersection
+
+
+################################################################################
+# USER INTERFACE #
+################################################################################
+
+def getUI(args):
+ """
+ build and return a UI object for this script.
+
+ :param args: raw arguments to parse
+ """
+ programName = os.path.basename(sys.argv[0])
+ longDescription = "takes a file with a list of p-values and applies " +\
+ "Benjamini and Hochberg FDR to convert to q-values "
+ shortDescription = "takes a file with a list of p-values and applies " +\
+ "Benjamini and Hochberg FDR to convert to q-values "
+
+ ui = CLI(programName, shortDescription, longDescription)
+ ui.minArgs = 2
+ ui.maxArgs = 2
+ ui.addOption(Option(short="o", long="output", argName="filename",
+ description="output to given file, else stdout",
+ required=False, type=str))
+ ui.addOption(Option(short="s", long="stranded",
+ description="treat regions on separate strands as " +
+ "disjoint, even if they overlap",
+ required=False))
+ ui.addOption(Option(short="v", long="verbose",
+ description="output additional messages to stderr " +
+ "about run", required=False))
+ ui.addOption(Option(short="h", long="help",
+ description="show this help message ", special=True))
+ ui.addOption(Option(short="u", long="test",
+ description="run unit tests ", special=True))
+
+ ui.parseCommandLine(args)
+ return ui
+
+
+################################################################################
+# COMMAND LINE PROCESSING AND DISPATCH #
+################################################################################
+
+def main(args):
+ """
+ main entry point for the GenomicIntIntersection script.
+
+ :param args: the arguments for this script, as a list of string. Should
+ already have had things like the script name stripped. That
+ is, if there are no args provided, this should be an empty
+ list.
+ """
+ # get options and arguments
+ ui = getUI(args)
+
+ if ui.optionIsSet("test"):
+ # just run unit tests
+ unittest.main(argv=[sys.argv[0]])
+ elif ui.optionIsSet("help"):
+ # just show help
+ ui.usage()
+ else:
+ verbose = ui.optionIsSet("verbose")
+
+ # stranded?
+ stranded = ui.optionIsSet("stranded")
+ if stranded:
+ sys.stderr.write("Sorry, stranded mode hasn't been implemented yet.")
+ sys.exit()
+
+ # get output handle
+ out_fh = sys.stdout
+ if ui.optionIsSet("output"):
+ out_fh = open(ui.getValue("output"), "w")
+
+ # get input file-handles -- we know we'll get exactly two, since we
+ # specified it in the UI definition
+ regions_1 = [x for x in BEDIterator(ui.getArgument(0), verbose=verbose)]
+ regions_2 = [x for x in BEDIterator(ui.getArgument(1), verbose=verbose)]
+
+ for r in regionsIntersection(regions_1, regions_2):
+ out_fh.write(str(r) + "\n")
+
+
+###############################################################################
+# UNIT TESTS #
+###############################################################################
+
+class TestGenomicIntIntersection(unittest.TestCase):
+
+ """Unit tests for the GenomicIntIntersection program/script."""
+
+ def setUp(self):
+ f1_conts = "\n".join(["\t".join(["chr1", "10", "20", "F1_R1", "0", "+"]),
+ "\t".join(["chr1", "70", "80", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "05", "10", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "70", "75", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "90", "95", "F2_R2", "0", "+"])])
+ f2_conts = "\n".join(["\t".join(["chr1", "07", "12", "F1_R1", "0", "+"]),
+ "\t".join(["chr1", "67", "70", "F2_R2", "0", "+"]),
+ "\t".join(["chr1", "75", "85", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "20", "30", "F2_R2", "0", "+"]),
+ "\t".join(["chr2", "73", "92", "F2_R2", "0", "+"])])
+ self.str_d = {"file1.bed": f1_conts, "file2.bed": f2_conts}
+
+ @mock.patch('__builtin__.open')
+ def testTwoFiles(self, mock_open):
+ outfh = StringIO.StringIO()
+ stream_d = {"out.bed": outfh}
+ mock_open.side_effect = build_mock_open_side_effect(self.str_d, stream_d)
+ main(["-o", "out.bed", "file1.bed", "file2.bed"])
+ expect = "\n".join(["\t".join(["chr1", "10", "12", "X", "0", "+"]),
+ "\t".join(["chr1", "75", "80", "X", "0", "+"]),
+ "\t".join(["chr2", "73", "75", "X", "0", "+"]),
+ "\t".join(["chr2", "90", "92", "X", "0", "+"])]) + "\n"
+ self.assertEqual(outfh.getvalue(), expect)
+
+
+###############################################################################
+# ENTRY POINT WHEN RUN AS A STAND-ALONE MODULE #
+###############################################################################
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff --git a/src/test.py b/src/test.py
index eaba3ff..8096ee3 100755
--- a/src/test.py
+++ b/src/test.py
@@ -92,6 +92,8 @@
from pyokit.scripts.overlapProfile import TestOverlapProfile
from pyokit.scripts.readCounts import ConvertJunctionsUnitTests
from pyokit.scripts.readCountExonToGene import ExonToGeneReadCountTests
+from pyokit.scripts.genomicIntJaccard import TestGenomicIntJaccard
+from pyokit.scripts.genomicIntersection import TestGenomicIntIntersection
from pyokit.scripts.pyokitMain import PyokitSmokeTests
@@ -151,6 +153,8 @@
sys.stderr.write(" " + str(TestOverlapProfile) + "\n")
sys.stderr.write(" " + str(ConvertJunctionsUnitTests) + "\n")
sys.stderr.write(" " + str(ExonToGeneReadCountTests) + "\n")
+ sys.stderr.write(" " + str(TestGenomicIntJaccard) + "\n")
+ sys.stderr.write(" " + str(TestGenomicIntIntersection) + "\n")
sys.stderr.write(" " + str(PyokitSmokeTests) + "\n\n")
sys.stderr.write("\n\n RUNNING TESTS \n\n")