diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h index 372b83d1172a1..189dd041bffe2 100644 --- a/llvm/include/llvm/Support/SuffixTree.h +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -13,167 +13,12 @@ #define LLVM_SUPPORT_SUFFIXTREE_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/SuffixTreeNode.h" #include namespace llvm { - -/// A node in a suffix tree which represents a substring or suffix. -/// -/// Each node has either no children or at least two children, with the root -/// being a exception in the empty tree. -/// -/// Children are represented as a map between unsigned integers and nodes. If -/// a node N has a child M on unsigned integer k, then the mapping represented -/// by N is a proper prefix of the mapping represented by M. Note that this, -/// although similar to a trie is somewhat different: each node stores a full -/// substring of the full mapping rather than a single character state. -/// -/// Each internal node contains a pointer to the internal node representing -/// the same string, but with the first character chopped off. This is stored -/// in \p Link. Each leaf node stores the start index of its respective -/// suffix in \p SuffixIdx. -struct SuffixTreeNode { -public: - /// Represents an undefined index in the suffix tree. - static const unsigned EmptyIdx = -1; - enum class NodeKind { ST_Leaf, ST_Internal }; - -private: - const NodeKind Kind; - /// The start index of this node's substring in the main string. - unsigned StartIdx = EmptyIdx; - - /// The length of the string formed by concatenating the edge labels from - /// the root to this node. - unsigned ConcatLen = 0; - -public: - NodeKind getKind() const { return Kind; } - - /// \return the start index of this node's substring in the entire string. - virtual unsigned getStartIdx() const { return StartIdx; } - - /// \returns the end index of this node. - virtual unsigned getEndIdx() const = 0; - - /// Advance this node's StartIdx by \p Inc. - void incrementStartIdx(unsigned Inc) { StartIdx += Inc; } - - /// Set the length of the string from the root to this node to \p Len. - void setConcatLen(unsigned Len) { ConcatLen = Len; } - - /// \returns the length of the string from the root to this node. - unsigned getConcatLen() const { return ConcatLen; } - - SuffixTreeNode(NodeKind Kind, unsigned StartIdx) - : Kind(Kind), StartIdx(StartIdx) {} - virtual ~SuffixTreeNode() = default; -}; - -struct SuffixTreeInternalNode : SuffixTreeNode { -private: - /// The end index of this node's substring in the main string. - /// - /// Every leaf node must have its \p EndIdx incremented at the end of every - /// step in the construction algorithm. To avoid having to update O(N) - /// nodes individually at the end of every step, the end index is stored - /// as a pointer. - unsigned EndIdx = EmptyIdx; - - /// A pointer to the internal node representing the same sequence with the - /// first character chopped off. - /// - /// This acts as a shortcut in Ukkonen's algorithm. One of the things that - /// Ukkonen's algorithm does to achieve linear-time construction is - /// keep track of which node the next insert should be at. This makes each - /// insert O(1), and there are a total of O(N) inserts. The suffix link - /// helps with inserting children of internal nodes. - /// - /// Say we add a child to an internal node with associated mapping S. The - /// next insertion must be at the node representing S - its first character. - /// This is given by the way that we iteratively build the tree in Ukkonen's - /// algorithm. The main idea is to look at the suffixes of each prefix in the - /// string, starting with the longest suffix of the prefix, and ending with - /// the shortest. Therefore, if we keep pointers between such nodes, we can - /// move to the next insertion point in O(1) time. If we don't, then we'd - /// have to query from the root, which takes O(N) time. This would make the - /// construction algorithm O(N^2) rather than O(N). - SuffixTreeInternalNode *Link = nullptr; - -public: - static bool classof(const SuffixTreeNode *N) { - return N->getKind() == NodeKind::ST_Internal; - } - - /// \returns true if this node is the root of its owning \p SuffixTree. - bool isRoot() const { return getStartIdx() == EmptyIdx; } - - /// \returns the end index of this node's substring in the entire string. - unsigned getEndIdx() const override { return EndIdx; } - - /// Sets \p Link to \p L. Assumes \p L is not null. - void setLink(SuffixTreeInternalNode *L) { - assert(L && "Cannot set a null link?"); - Link = L; - } - - /// \returns the pointer to the Link node. - SuffixTreeInternalNode *getLink() const { - return Link; - } - - /// The children of this node. - /// - /// A child existing on an unsigned integer implies that from the mapping - /// represented by the current node, there is a way to reach another - /// mapping by tacking that character on the end of the current string. - DenseMap Children; - - SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, - SuffixTreeInternalNode *Link) - : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), - Link(Link) {} - - virtual ~SuffixTreeInternalNode() = default; -}; - -struct SuffixTreeLeafNode : SuffixTreeNode { -private: - /// The start index of the suffix represented by this leaf. - unsigned SuffixIdx = EmptyIdx; - - /// The end index of this node's substring in the main string. - /// - /// Every leaf node must have its \p EndIdx incremented at the end of every - /// step in the construction algorithm. To avoid having to update O(N) - /// nodes individually at the end of every step, the end index is stored - /// as a pointer. - unsigned *EndIdx = nullptr; - -public: - static bool classof(const SuffixTreeNode *N) { - return N->getKind() == NodeKind::ST_Leaf; - } - - /// \returns the end index of this node's substring in the entire string. - unsigned getEndIdx() const override { - assert(EndIdx && "EndIdx is empty?"); - return *EndIdx; - } - - /// \returns the start index of the suffix represented by this leaf. - unsigned getSuffixIdx() const { return SuffixIdx; } - /// Sets the start index of the suffix represented by this leaf to \p Idx. - void setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; } - SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) - : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} - - virtual ~SuffixTreeLeafNode() = default; -}; - /// A data structure for fast substring queries. /// /// Suffix trees represent the suffixes of their input strings in their leaves. diff --git a/llvm/include/llvm/Support/SuffixTreeNode.h b/llvm/include/llvm/Support/SuffixTreeNode.h new file mode 100644 index 0000000000000..7d0d1cf0c58b9 --- /dev/null +++ b/llvm/include/llvm/Support/SuffixTreeNode.h @@ -0,0 +1,171 @@ +//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines nodes for use within a SuffixTree. +// +// Each node has either no children or at least two children, with the root +// being a exception in the empty tree. +// +// Children are represented as a map between unsigned integers and nodes. If +// a node N has a child M on unsigned integer k, then the mapping represented +// by N is a proper prefix of the mapping represented by M. Note that this, +// although similar to a trie is somewhat different: each node stores a full +// substring of the full mapping rather than a single character state. +// +// Each internal node contains a pointer to the internal node representing +// the same string, but with the first character chopped off. This is stored +// in \p Link. Each leaf node stores the start index of its respective +// suffix in \p SuffixIdx. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H +#define LLVM_SUPPORT_SUFFIXTREE_NODE_H +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + +/// A node in a suffix tree which represents a substring or suffix. +struct SuffixTreeNode { +public: + /// Represents an undefined index in the suffix tree. + static const unsigned EmptyIdx = -1; + enum class NodeKind { ST_Leaf, ST_Internal }; + +private: + const NodeKind Kind; + + /// The start index of this node's substring in the main string. + unsigned StartIdx = EmptyIdx; + + /// The length of the string formed by concatenating the edge labels from + /// the root to this node. + unsigned ConcatLen = 0; + +public: + // LLVM RTTI boilerplate. + NodeKind getKind() const { return Kind; } + + /// \return the start index of this node's substring in the entire string. + unsigned getStartIdx() const; + + /// \returns the end index of this node. + virtual unsigned getEndIdx() const = 0; + + /// Advance this node's StartIdx by \p Inc. + void incrementStartIdx(unsigned Inc); + + /// Set the length of the string from the root to this node to \p Len. + void setConcatLen(unsigned Len); + + /// \returns the length of the string from the root to this node. + unsigned getConcatLen() const; + + SuffixTreeNode(NodeKind Kind, unsigned StartIdx) + : Kind(Kind), StartIdx(StartIdx) {} + virtual ~SuffixTreeNode() = default; +}; + +// A node with two or more children, or the root. +struct SuffixTreeInternalNode : SuffixTreeNode { +private: + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned EndIdx = EmptyIdx; + + /// A pointer to the internal node representing the same sequence with the + /// first character chopped off. + /// + /// This acts as a shortcut in Ukkonen's algorithm. One of the things that + /// Ukkonen's algorithm does to achieve linear-time construction is + /// keep track of which node the next insert should be at. This makes each + /// insert O(1), and there are a total of O(N) inserts. The suffix link + /// helps with inserting children of internal nodes. + /// + /// Say we add a child to an internal node with associated mapping S. The + /// next insertion must be at the node representing S - its first character. + /// This is given by the way that we iteratively build the tree in Ukkonen's + /// algorithm. The main idea is to look at the suffixes of each prefix in the + /// string, starting with the longest suffix of the prefix, and ending with + /// the shortest. Therefore, if we keep pointers between such nodes, we can + /// move to the next insertion point in O(1) time. If we don't, then we'd + /// have to query from the root, which takes O(N) time. This would make the + /// construction algorithm O(N^2) rather than O(N). + SuffixTreeInternalNode *Link = nullptr; + +public: + // LLVM RTTI boilerplate. + static bool classof(const SuffixTreeNode *N) { + return N->getKind() == NodeKind::ST_Internal; + } + + /// \returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const; + + /// \returns the end index of this node's substring in the entire string. + unsigned getEndIdx() const override; + + /// Sets \p Link to \p L. Assumes \p L is not null. + void setLink(SuffixTreeInternalNode *L); + + /// \returns the pointer to the Link node. + SuffixTreeInternalNode *getLink() const; + + /// The children of this node. + /// + /// A child existing on an unsigned integer implies that from the mapping + /// represented by the current node, there is a way to reach another + /// mapping by tacking that character on the end of the current string. + DenseMap Children; + + SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, + SuffixTreeInternalNode *Link) + : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), + Link(Link) {} + + virtual ~SuffixTreeInternalNode() = default; +}; + +// A node representing a suffix. +struct SuffixTreeLeafNode : SuffixTreeNode { +private: + /// The start index of the suffix represented by this leaf. + unsigned SuffixIdx = EmptyIdx; + + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned *EndIdx = nullptr; + +public: + // LLVM RTTI boilerplate. + static bool classof(const SuffixTreeNode *N) { + return N->getKind() == NodeKind::ST_Leaf; + } + + /// \returns the end index of this node's substring in the entire string. + unsigned getEndIdx() const override; + + /// \returns the start index of the suffix represented by this leaf. + unsigned getSuffixIdx() const; + + /// Sets the start index of the suffix represented by this leaf to \p Idx. + void setSuffixIdx(unsigned Idx); + SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) + : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} + + virtual ~SuffixTreeLeafNode() = default; +}; +} // namespace llvm +#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H \ No newline at end of file diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index b000c436670e1..714f9ba0214f5 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -217,6 +217,7 @@ add_llvm_component_library(LLVMSupport StringMap.cpp StringSaver.cpp StringRef.cpp + SuffixTreeNode.cpp SuffixTree.cpp SystemUtils.cpp TarWriter.cpp diff --git a/llvm/lib/Support/SuffixTreeNode.cpp b/llvm/lib/Support/SuffixTreeNode.cpp new file mode 100644 index 0000000000000..113b990fd352f --- /dev/null +++ b/llvm/lib/Support/SuffixTreeNode.cpp @@ -0,0 +1,40 @@ +//===- llvm/ADT/SuffixTreeNode.cpp - Nodes for SuffixTrees --------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines nodes for use within a SuffixTree. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/SuffixTreeNode.h" +#include "llvm/Support/Casting.h" + +using namespace llvm; + +unsigned SuffixTreeNode::getStartIdx() const { return StartIdx; } +void SuffixTreeNode::incrementStartIdx(unsigned Inc) { StartIdx += Inc; } +void SuffixTreeNode::setConcatLen(unsigned Len) { ConcatLen = Len; } +unsigned SuffixTreeNode::getConcatLen() const { return ConcatLen; } + +bool SuffixTreeInternalNode::isRoot() const { + return getStartIdx() == EmptyIdx; +} +unsigned SuffixTreeInternalNode::getEndIdx() const { return EndIdx; } +void SuffixTreeInternalNode::setLink(SuffixTreeInternalNode *L) { + assert(L && "Cannot set a null link?"); + Link = L; +} +SuffixTreeInternalNode *SuffixTreeInternalNode::getLink() const { return Link; } + +unsigned SuffixTreeLeafNode::getEndIdx() const { + assert(EndIdx && "EndIdx is empty?"); + return *EndIdx; +} + +unsigned SuffixTreeLeafNode::getSuffixIdx() const { return SuffixIdx; } +void SuffixTreeLeafNode::setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }