-
Notifications
You must be signed in to change notification settings - Fork 10.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[NFC] SuffixTree: Split out SuffixTreeNodes into their own files
Add: - SuffixTreeNode.h - SuffixTreeNode.cpp The SuffixTree file was getting too long.
- Loading branch information
Jessica Paquette
committed
May 12, 2023
1 parent
66520c0
commit 6cf993e
Showing
4 changed files
with
213 additions
and
156 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// This file defines nodes for use within a SuffixTree. | ||
// | ||
// Each node has either no children or at least two children, with the root | ||
// being a exception in the empty tree. | ||
// | ||
// Children are represented as a map between unsigned integers and nodes. If | ||
// a node N has a child M on unsigned integer k, then the mapping represented | ||
// by N is a proper prefix of the mapping represented by M. Note that this, | ||
// although similar to a trie is somewhat different: each node stores a full | ||
// substring of the full mapping rather than a single character state. | ||
// | ||
// Each internal node contains a pointer to the internal node representing | ||
// the same string, but with the first character chopped off. This is stored | ||
// in \p Link. Each leaf node stores the start index of its respective | ||
// suffix in \p SuffixIdx. | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H | ||
#define LLVM_SUPPORT_SUFFIXTREE_NODE_H | ||
#include "llvm/ADT/DenseMap.h" | ||
|
||
namespace llvm { | ||
|
||
/// A node in a suffix tree which represents a substring or suffix. | ||
struct SuffixTreeNode { | ||
public: | ||
/// Represents an undefined index in the suffix tree. | ||
static const unsigned EmptyIdx = -1; | ||
enum class NodeKind { ST_Leaf, ST_Internal }; | ||
|
||
private: | ||
const NodeKind Kind; | ||
|
||
/// The start index of this node's substring in the main string. | ||
unsigned StartIdx = EmptyIdx; | ||
|
||
/// The length of the string formed by concatenating the edge labels from | ||
/// the root to this node. | ||
unsigned ConcatLen = 0; | ||
|
||
public: | ||
// LLVM RTTI boilerplate. | ||
NodeKind getKind() const { return Kind; } | ||
|
||
/// \return the start index of this node's substring in the entire string. | ||
unsigned getStartIdx() const; | ||
|
||
/// \returns the end index of this node. | ||
virtual unsigned getEndIdx() const = 0; | ||
|
||
/// Advance this node's StartIdx by \p Inc. | ||
void incrementStartIdx(unsigned Inc); | ||
|
||
/// Set the length of the string from the root to this node to \p Len. | ||
void setConcatLen(unsigned Len); | ||
|
||
/// \returns the length of the string from the root to this node. | ||
unsigned getConcatLen() const; | ||
|
||
SuffixTreeNode(NodeKind Kind, unsigned StartIdx) | ||
: Kind(Kind), StartIdx(StartIdx) {} | ||
virtual ~SuffixTreeNode() = default; | ||
}; | ||
|
||
// A node with two or more children, or the root. | ||
struct SuffixTreeInternalNode : SuffixTreeNode { | ||
private: | ||
/// The end index of this node's substring in the main string. | ||
/// | ||
/// Every leaf node must have its \p EndIdx incremented at the end of every | ||
/// step in the construction algorithm. To avoid having to update O(N) | ||
/// nodes individually at the end of every step, the end index is stored | ||
/// as a pointer. | ||
unsigned EndIdx = EmptyIdx; | ||
|
||
/// A pointer to the internal node representing the same sequence with the | ||
/// first character chopped off. | ||
/// | ||
/// This acts as a shortcut in Ukkonen's algorithm. One of the things that | ||
/// Ukkonen's algorithm does to achieve linear-time construction is | ||
/// keep track of which node the next insert should be at. This makes each | ||
/// insert O(1), and there are a total of O(N) inserts. The suffix link | ||
/// helps with inserting children of internal nodes. | ||
/// | ||
/// Say we add a child to an internal node with associated mapping S. The | ||
/// next insertion must be at the node representing S - its first character. | ||
/// This is given by the way that we iteratively build the tree in Ukkonen's | ||
/// algorithm. The main idea is to look at the suffixes of each prefix in the | ||
/// string, starting with the longest suffix of the prefix, and ending with | ||
/// the shortest. Therefore, if we keep pointers between such nodes, we can | ||
/// move to the next insertion point in O(1) time. If we don't, then we'd | ||
/// have to query from the root, which takes O(N) time. This would make the | ||
/// construction algorithm O(N^2) rather than O(N). | ||
SuffixTreeInternalNode *Link = nullptr; | ||
|
||
public: | ||
// LLVM RTTI boilerplate. | ||
static bool classof(const SuffixTreeNode *N) { | ||
return N->getKind() == NodeKind::ST_Internal; | ||
} | ||
|
||
/// \returns true if this node is the root of its owning \p SuffixTree. | ||
bool isRoot() const; | ||
|
||
/// \returns the end index of this node's substring in the entire string. | ||
unsigned getEndIdx() const override; | ||
|
||
/// Sets \p Link to \p L. Assumes \p L is not null. | ||
void setLink(SuffixTreeInternalNode *L); | ||
|
||
/// \returns the pointer to the Link node. | ||
SuffixTreeInternalNode *getLink() const; | ||
|
||
/// The children of this node. | ||
/// | ||
/// A child existing on an unsigned integer implies that from the mapping | ||
/// represented by the current node, there is a way to reach another | ||
/// mapping by tacking that character on the end of the current string. | ||
DenseMap<unsigned, SuffixTreeNode *> Children; | ||
|
||
SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, | ||
SuffixTreeInternalNode *Link) | ||
: SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), | ||
Link(Link) {} | ||
|
||
virtual ~SuffixTreeInternalNode() = default; | ||
}; | ||
|
||
// A node representing a suffix. | ||
struct SuffixTreeLeafNode : SuffixTreeNode { | ||
private: | ||
/// The start index of the suffix represented by this leaf. | ||
unsigned SuffixIdx = EmptyIdx; | ||
|
||
/// The end index of this node's substring in the main string. | ||
/// | ||
/// Every leaf node must have its \p EndIdx incremented at the end of every | ||
/// step in the construction algorithm. To avoid having to update O(N) | ||
/// nodes individually at the end of every step, the end index is stored | ||
/// as a pointer. | ||
unsigned *EndIdx = nullptr; | ||
|
||
public: | ||
// LLVM RTTI boilerplate. | ||
static bool classof(const SuffixTreeNode *N) { | ||
return N->getKind() == NodeKind::ST_Leaf; | ||
} | ||
|
||
/// \returns the end index of this node's substring in the entire string. | ||
unsigned getEndIdx() const override; | ||
|
||
/// \returns the start index of the suffix represented by this leaf. | ||
unsigned getSuffixIdx() const; | ||
|
||
/// Sets the start index of the suffix represented by this leaf to \p Idx. | ||
void setSuffixIdx(unsigned Idx); | ||
SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) | ||
: SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} | ||
|
||
virtual ~SuffixTreeLeafNode() = default; | ||
}; | ||
} // namespace llvm | ||
#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
//===- llvm/ADT/SuffixTreeNode.cpp - Nodes for SuffixTrees --------*- C++ | ||
//-*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// This file defines nodes for use within a SuffixTree. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "llvm/Support/SuffixTreeNode.h" | ||
#include "llvm/Support/Casting.h" | ||
|
||
using namespace llvm; | ||
|
||
unsigned SuffixTreeNode::getStartIdx() const { return StartIdx; } | ||
void SuffixTreeNode::incrementStartIdx(unsigned Inc) { StartIdx += Inc; } | ||
void SuffixTreeNode::setConcatLen(unsigned Len) { ConcatLen = Len; } | ||
unsigned SuffixTreeNode::getConcatLen() const { return ConcatLen; } | ||
|
||
bool SuffixTreeInternalNode::isRoot() const { | ||
return getStartIdx() == EmptyIdx; | ||
} | ||
unsigned SuffixTreeInternalNode::getEndIdx() const { return EndIdx; } | ||
void SuffixTreeInternalNode::setLink(SuffixTreeInternalNode *L) { | ||
assert(L && "Cannot set a null link?"); | ||
Link = L; | ||
} | ||
SuffixTreeInternalNode *SuffixTreeInternalNode::getLink() const { return Link; } | ||
|
||
unsigned SuffixTreeLeafNode::getEndIdx() const { | ||
assert(EndIdx && "EndIdx is empty?"); | ||
return *EndIdx; | ||
} | ||
|
||
unsigned SuffixTreeLeafNode::getSuffixIdx() const { return SuffixIdx; } | ||
void SuffixTreeLeafNode::setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; } |