Skip to content

Commit

Permalink
[NFC] SuffixTree: Split out SuffixTreeNodes into their own files
Browse files Browse the repository at this point in the history
Add:

- SuffixTreeNode.h
- SuffixTreeNode.cpp

The SuffixTree file was getting too long.
  • Loading branch information
Jessica Paquette committed May 12, 2023
1 parent 66520c0 commit 6cf993e
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 156 deletions.
157 changes: 1 addition & 156 deletions llvm/include/llvm/Support/SuffixTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,167 +13,12 @@
#define LLVM_SUPPORT_SUFFIXTREE_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/SuffixTreeNode.h"
#include <vector>

namespace llvm {

/// A node in a suffix tree which represents a substring or suffix.
///
/// Each node has either no children or at least two children, with the root
/// being a exception in the empty tree.
///
/// Children are represented as a map between unsigned integers and nodes. If
/// a node N has a child M on unsigned integer k, then the mapping represented
/// by N is a proper prefix of the mapping represented by M. Note that this,
/// although similar to a trie is somewhat different: each node stores a full
/// substring of the full mapping rather than a single character state.
///
/// Each internal node contains a pointer to the internal node representing
/// the same string, but with the first character chopped off. This is stored
/// in \p Link. Each leaf node stores the start index of its respective
/// suffix in \p SuffixIdx.
struct SuffixTreeNode {
public:
/// Represents an undefined index in the suffix tree.
static const unsigned EmptyIdx = -1;
enum class NodeKind { ST_Leaf, ST_Internal };

private:
const NodeKind Kind;
/// The start index of this node's substring in the main string.
unsigned StartIdx = EmptyIdx;

/// The length of the string formed by concatenating the edge labels from
/// the root to this node.
unsigned ConcatLen = 0;

public:
NodeKind getKind() const { return Kind; }

/// \return the start index of this node's substring in the entire string.
virtual unsigned getStartIdx() const { return StartIdx; }

/// \returns the end index of this node.
virtual unsigned getEndIdx() const = 0;

/// Advance this node's StartIdx by \p Inc.
void incrementStartIdx(unsigned Inc) { StartIdx += Inc; }

/// Set the length of the string from the root to this node to \p Len.
void setConcatLen(unsigned Len) { ConcatLen = Len; }

/// \returns the length of the string from the root to this node.
unsigned getConcatLen() const { return ConcatLen; }

SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
: Kind(Kind), StartIdx(StartIdx) {}
virtual ~SuffixTreeNode() = default;
};

struct SuffixTreeInternalNode : SuffixTreeNode {
private:
/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned EndIdx = EmptyIdx;

/// A pointer to the internal node representing the same sequence with the
/// first character chopped off.
///
/// This acts as a shortcut in Ukkonen's algorithm. One of the things that
/// Ukkonen's algorithm does to achieve linear-time construction is
/// keep track of which node the next insert should be at. This makes each
/// insert O(1), and there are a total of O(N) inserts. The suffix link
/// helps with inserting children of internal nodes.
///
/// Say we add a child to an internal node with associated mapping S. The
/// next insertion must be at the node representing S - its first character.
/// This is given by the way that we iteratively build the tree in Ukkonen's
/// algorithm. The main idea is to look at the suffixes of each prefix in the
/// string, starting with the longest suffix of the prefix, and ending with
/// the shortest. Therefore, if we keep pointers between such nodes, we can
/// move to the next insertion point in O(1) time. If we don't, then we'd
/// have to query from the root, which takes O(N) time. This would make the
/// construction algorithm O(N^2) rather than O(N).
SuffixTreeInternalNode *Link = nullptr;

public:
static bool classof(const SuffixTreeNode *N) {
return N->getKind() == NodeKind::ST_Internal;
}

/// \returns true if this node is the root of its owning \p SuffixTree.
bool isRoot() const { return getStartIdx() == EmptyIdx; }

/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override { return EndIdx; }

/// Sets \p Link to \p L. Assumes \p L is not null.
void setLink(SuffixTreeInternalNode *L) {
assert(L && "Cannot set a null link?");
Link = L;
}

/// \returns the pointer to the Link node.
SuffixTreeInternalNode *getLink() const {
return Link;
}

/// The children of this node.
///
/// A child existing on an unsigned integer implies that from the mapping
/// represented by the current node, there is a way to reach another
/// mapping by tacking that character on the end of the current string.
DenseMap<unsigned, SuffixTreeNode *> Children;

SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
SuffixTreeInternalNode *Link)
: SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
Link(Link) {}

virtual ~SuffixTreeInternalNode() = default;
};

struct SuffixTreeLeafNode : SuffixTreeNode {
private:
/// The start index of the suffix represented by this leaf.
unsigned SuffixIdx = EmptyIdx;

/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned *EndIdx = nullptr;

public:
static bool classof(const SuffixTreeNode *N) {
return N->getKind() == NodeKind::ST_Leaf;
}

/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override {
assert(EndIdx && "EndIdx is empty?");
return *EndIdx;
}

/// \returns the start index of the suffix represented by this leaf.
unsigned getSuffixIdx() const { return SuffixIdx; }
/// Sets the start index of the suffix represented by this leaf to \p Idx.
void setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }
SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
: SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}

virtual ~SuffixTreeLeafNode() = default;
};

/// A data structure for fast substring queries.
///
/// Suffix trees represent the suffixes of their input strings in their leaves.
Expand Down
171 changes: 171 additions & 0 deletions llvm/include/llvm/Support/SuffixTreeNode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines nodes for use within a SuffixTree.
//
// Each node has either no children or at least two children, with the root
// being a exception in the empty tree.
//
// Children are represented as a map between unsigned integers and nodes. If
// a node N has a child M on unsigned integer k, then the mapping represented
// by N is a proper prefix of the mapping represented by M. Note that this,
// although similar to a trie is somewhat different: each node stores a full
// substring of the full mapping rather than a single character state.
//
// Each internal node contains a pointer to the internal node representing
// the same string, but with the first character chopped off. This is stored
// in \p Link. Each leaf node stores the start index of its respective
// suffix in \p SuffixIdx.
//===----------------------------------------------------------------------===//

#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
#include "llvm/ADT/DenseMap.h"

namespace llvm {

/// A node in a suffix tree which represents a substring or suffix.
struct SuffixTreeNode {
public:
/// Represents an undefined index in the suffix tree.
static const unsigned EmptyIdx = -1;
enum class NodeKind { ST_Leaf, ST_Internal };

private:
const NodeKind Kind;

/// The start index of this node's substring in the main string.
unsigned StartIdx = EmptyIdx;

/// The length of the string formed by concatenating the edge labels from
/// the root to this node.
unsigned ConcatLen = 0;

public:
// LLVM RTTI boilerplate.
NodeKind getKind() const { return Kind; }

/// \return the start index of this node's substring in the entire string.
unsigned getStartIdx() const;

/// \returns the end index of this node.
virtual unsigned getEndIdx() const = 0;

/// Advance this node's StartIdx by \p Inc.
void incrementStartIdx(unsigned Inc);

/// Set the length of the string from the root to this node to \p Len.
void setConcatLen(unsigned Len);

/// \returns the length of the string from the root to this node.
unsigned getConcatLen() const;

SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
: Kind(Kind), StartIdx(StartIdx) {}
virtual ~SuffixTreeNode() = default;
};

// A node with two or more children, or the root.
struct SuffixTreeInternalNode : SuffixTreeNode {
private:
/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned EndIdx = EmptyIdx;

/// A pointer to the internal node representing the same sequence with the
/// first character chopped off.
///
/// This acts as a shortcut in Ukkonen's algorithm. One of the things that
/// Ukkonen's algorithm does to achieve linear-time construction is
/// keep track of which node the next insert should be at. This makes each
/// insert O(1), and there are a total of O(N) inserts. The suffix link
/// helps with inserting children of internal nodes.
///
/// Say we add a child to an internal node with associated mapping S. The
/// next insertion must be at the node representing S - its first character.
/// This is given by the way that we iteratively build the tree in Ukkonen's
/// algorithm. The main idea is to look at the suffixes of each prefix in the
/// string, starting with the longest suffix of the prefix, and ending with
/// the shortest. Therefore, if we keep pointers between such nodes, we can
/// move to the next insertion point in O(1) time. If we don't, then we'd
/// have to query from the root, which takes O(N) time. This would make the
/// construction algorithm O(N^2) rather than O(N).
SuffixTreeInternalNode *Link = nullptr;

public:
// LLVM RTTI boilerplate.
static bool classof(const SuffixTreeNode *N) {
return N->getKind() == NodeKind::ST_Internal;
}

/// \returns true if this node is the root of its owning \p SuffixTree.
bool isRoot() const;

/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override;

/// Sets \p Link to \p L. Assumes \p L is not null.
void setLink(SuffixTreeInternalNode *L);

/// \returns the pointer to the Link node.
SuffixTreeInternalNode *getLink() const;

/// The children of this node.
///
/// A child existing on an unsigned integer implies that from the mapping
/// represented by the current node, there is a way to reach another
/// mapping by tacking that character on the end of the current string.
DenseMap<unsigned, SuffixTreeNode *> Children;

SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
SuffixTreeInternalNode *Link)
: SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
Link(Link) {}

virtual ~SuffixTreeInternalNode() = default;
};

// A node representing a suffix.
struct SuffixTreeLeafNode : SuffixTreeNode {
private:
/// The start index of the suffix represented by this leaf.
unsigned SuffixIdx = EmptyIdx;

/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned *EndIdx = nullptr;

public:
// LLVM RTTI boilerplate.
static bool classof(const SuffixTreeNode *N) {
return N->getKind() == NodeKind::ST_Leaf;
}

/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override;

/// \returns the start index of the suffix represented by this leaf.
unsigned getSuffixIdx() const;

/// Sets the start index of the suffix represented by this leaf to \p Idx.
void setSuffixIdx(unsigned Idx);
SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
: SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}

virtual ~SuffixTreeLeafNode() = default;
};
} // namespace llvm
#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
1 change: 1 addition & 0 deletions llvm/lib/Support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ add_llvm_component_library(LLVMSupport
StringMap.cpp
StringSaver.cpp
StringRef.cpp
SuffixTreeNode.cpp
SuffixTree.cpp
SystemUtils.cpp
TarWriter.cpp
Expand Down
40 changes: 40 additions & 0 deletions llvm/lib/Support/SuffixTreeNode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
//===- llvm/ADT/SuffixTreeNode.cpp - Nodes for SuffixTrees --------*- C++
//-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines nodes for use within a SuffixTree.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/SuffixTreeNode.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

unsigned SuffixTreeNode::getStartIdx() const { return StartIdx; }
void SuffixTreeNode::incrementStartIdx(unsigned Inc) { StartIdx += Inc; }
void SuffixTreeNode::setConcatLen(unsigned Len) { ConcatLen = Len; }
unsigned SuffixTreeNode::getConcatLen() const { return ConcatLen; }

bool SuffixTreeInternalNode::isRoot() const {
return getStartIdx() == EmptyIdx;
}
unsigned SuffixTreeInternalNode::getEndIdx() const { return EndIdx; }
void SuffixTreeInternalNode::setLink(SuffixTreeInternalNode *L) {
assert(L && "Cannot set a null link?");
Link = L;
}
SuffixTreeInternalNode *SuffixTreeInternalNode::getLink() const { return Link; }

unsigned SuffixTreeLeafNode::getEndIdx() const {
assert(EndIdx && "EndIdx is empty?");
return *EndIdx;
}

unsigned SuffixTreeLeafNode::getSuffixIdx() const { return SuffixIdx; }
void SuffixTreeLeafNode::setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }

0 comments on commit 6cf993e

Please sign in to comment.