Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Universal B tree implementation #746

Merged
merged 14 commits into from Aug 29, 2016
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions src/mlpack/core/tree/CMakeLists.txt
Expand Up @@ -15,6 +15,7 @@ set(SOURCES
binary_space_tree/mean_split_impl.hpp
binary_space_tree/midpoint_split.hpp
binary_space_tree/midpoint_split_impl.hpp
binary_space_tree/perform_split.hpp
binary_space_tree/rp_tree_max_split.hpp
binary_space_tree/rp_tree_max_split_impl.hpp
binary_space_tree/rp_tree_mean_split.hpp
Expand Down
30 changes: 21 additions & 9 deletions src/mlpack/core/tree/address.hpp
Expand Up @@ -37,6 +37,16 @@ namespace addr {
* variables should be equal-sized and the type of the address should correspond
* to the type of the vector.
*
* The function maps each floating point coordinate to an unsigned integer of
* the same size in such a way that the transform preserves the ordering
* (i.e. lower coordinates correspond to lower integers). To achieve this, the
* mapping stores the exponent and the mantissa of each floating point value
* consecutively, with the exponent placed before the mantissa. For negative
* numbers the resulting integer value is inverted.
* In the multi-dimensional case, after each coordinate has been transformed,
* the bits of the new representations are interleaved across all the elements
* of the address vector.
*
* @param address The resulting address.
* @param point The point that is being translated to the address.
*/
Expand Down Expand Up @@ -122,6 +132,8 @@ void PointToAddress(AddressType& address, const VecType& point)
* variables should be equal-sized and the type of the address should correspond
* to the type of the vector.
*
* This function performs the inverse of the PointToAddress() transform.
*
* @param address An address to translate.
* @param point The point that corresponds to the address.
*/
Expand Down Expand Up @@ -201,9 +213,9 @@ void AddressToPoint(VecType& point, const AddressType& address)
template<typename AddressType1, typename AddressType2>
int CompareAddresses(const AddressType1& addr1, const AddressType2& addr2)
{
static_assert(sizeof(typename AddressType1::elem_type) ==
sizeof(typename AddressType2::elem_type), "We aren't able to compare "
"adresses of distinct sizes");
static_assert(std::is_same<typename AddressType1::elem_type,
typename AddressType2::elem_type>::value == true, "We aren't able to "
"compare adresses of distinct types");

assert(addr1.n_elem == addr2.n_elem);

Expand All @@ -225,13 +237,13 @@ template<typename AddressType1, typename AddressType2, typename AddressType3>
bool Contains(const AddressType1& address, const AddressType2& loBound,
const AddressType3& hiBound)
{
static_assert(sizeof(typename AddressType1::elem_type) ==
sizeof(typename AddressType2::elem_type), "We aren't able to compare "
"adresses of distinct sizes");
static_assert(std::is_same<typename AddressType1::elem_type,
typename AddressType2::elem_type>::value == true, "We aren't able to "
"compare adresses of distinct types");

static_assert(sizeof(typename AddressType1::elem_type) ==
sizeof(typename AddressType3::elem_type), "We aren't able to compare "
"adresses of distinct sizes");
static_assert(std::is_same<typename AddressType1::elem_type,
typename AddressType3::elem_type>::value == true, "We aren't able to "
"compare adresses of distinct types");

assert(address.n_elem == loBound.n_elem);
assert(address.n_elem == hiBound.n_elem);
Expand Down
80 changes: 0 additions & 80 deletions src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp
Expand Up @@ -507,86 +507,6 @@ class BinarySpaceTree
const size_t maxLeafSize,
SplitType<BoundType<MetricType>, MatType>& splitter);

/**
* Perform the split process according to the information about the split.
* The points of this node are reordered inside the dataset.
*
* @tparam SplitInfo Type of the object that describes the split.
* @param data The dataset used by the binary space tree.
* @param begin Index of the starting point in the dataset that belongs to
* this node.
* @param count Number of points in this node.
* @param splitInfo The information about the split.
* @return The split column: points assigned to the left child are placed
* before this index, points assigned to the right child start at it.
*/
template<typename SplitInfo>
size_t PerformSplit(MatType& data,
const size_t begin,
const size_t count,
const SplitInfo& splitInfo);

/**
* Perform the split process according to the information about the split and
* record how the points were permuted.
*
* @tparam SplitInfo Type of the object that describes the split.
* @param data The dataset used by the binary space tree.
* @param begin Index of the starting point in the dataset that belongs to
* this node.
* @param count Number of points in this node.
* @param splitInfo The information about the split.
* @param oldFromNew Vector which will be filled with the old positions for
* each new point.
* @return The split column: points assigned to the left child are placed
* before this index, points assigned to the right child start at it.
*/
template<typename SplitInfo>
size_t PerformSplit(MatType& data,
const size_t begin,
const size_t count,
const SplitInfo& splitInfo,
std::vector<size_t>& oldFromNew);

/**
* An overload for the universal B tree, selected when the split information
* is UBTreeSplit's SplitInfo. On the first call the function rearranges the
* whole dataset; on subsequent calls it only returns the split column.
*
* @param data The dataset used by the binary space tree.
* @param begin Index of the starting point in the dataset that belongs to
* this node.
* @param count Number of points in this node.
* @param splitInfo The information about the split.
* @return The split column.
*/
size_t PerformSplit(
MatType& data,
const size_t begin,
const size_t count,
const typename UBTreeSplit<BoundType<MetricType>,
MatType>::SplitInfo& splitInfo);

/**
* An overload for the universal B tree, selected when the split information
* is UBTreeSplit's SplitInfo. On the first call the function rearranges the
* whole dataset; on subsequent calls it only returns the split column.
*
* @param data The dataset used by the binary space tree.
* @param begin Index of the starting point in the dataset that belongs to
* this node.
* @param count Number of points in this node.
* @param splitInfo The information about the split.
* @param oldFromNew Vector which will be filled with the old positions for
* each new point.
* @return The split column.
*/
size_t PerformSplit(
MatType& data,
const size_t begin,
const size_t count,
const typename UBTreeSplit<BoundType<MetricType>,
MatType>::SplitInfo& splitInfo,
std::vector<size_t>& oldFromNew);

/**
* Update the bound of the current node. This method does not take into
* account bound-specific properties.
Expand Down
156 changes: 3 additions & 153 deletions src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
Expand Up @@ -765,7 +765,7 @@ void BinarySpaceTree<MetricType, StatisticType, MatType, BoundType, SplitType>::
// Perform the actual splitting. This will order the dataset such that
// points that belong to the left subtree are on the left of splitCol, and
// points from the right subtree are on the right side of splitCol.
splitCol = PerformSplit(*dataset, begin, count, splitInfo);
splitCol = splitter.PerformSplit(*dataset, begin, count, splitInfo);

assert(splitCol > begin);
assert(splitCol < begin + count);
Expand Down Expand Up @@ -831,7 +831,8 @@ SplitNode(std::vector<size_t>& oldFromNew,
// Perform the actual splitting. This will order the dataset such that
// points that belong to the left subtree are on the left of splitCol, and
// points from the right subtree are on the right side of splitCol.
splitCol = PerformSplit(*dataset, begin, count, splitInfo, oldFromNew);
splitCol = splitter.PerformSplit(*dataset, begin, count, splitInfo,
oldFromNew);

assert(splitCol > begin);
assert(splitCol < begin + count);
Expand All @@ -857,157 +858,6 @@ SplitNode(std::vector<size_t>& oldFromNew,
right->ParentDistance() = rightParentDistance;
}

/**
 * Partition the points of this node in place so that every point for which
 * Split::AssignToLeftNode() returns true precedes every point for which it
 * returns false.
 *
 * The range check is evaluated before any column is accessed, so the scan
 * never reads a column outside [begin, begin + count).
 *
 * @tparam SplitInfo Type of the object that describes the split.
 * @param data The dataset; columns in [begin, begin + count) are reordered.
 * @param begin Index of the first point that belongs to this node.
 * @param count Number of points in this node.
 * @param splitInfo The information about the split.
 * @return Index of the first point assigned to the right side.
 */
template<typename MetricType,
         typename StatisticType,
         typename MatType,
         template<typename BoundMetricType, typename...> class BoundType,
         template<typename SplitBoundType, typename SplitMatType>
             class SplitType>
template<typename SplitInfo>
size_t BinarySpaceTree<MetricType, StatisticType, MatType, BoundType,
                       SplitType>::PerformSplit(MatType& data,
                                                const size_t begin,
                                                const size_t count,
                                                const SplitInfo& splitInfo)
{
  // This method modifies the input dataset.  We loop both from the left and
  // right sides of the points contained in this node.  The points assigned to
  // the left node should end up on the left side of the matrix, and the
  // remaining points on the right side of the matrix.
  size_t left = begin;
  size_t right = begin + count - 1;

  // First half-iteration of the loop is out here because the termination
  // condition is in the middle.  The range check comes first: if every point
  // belongs to the left node, 'left' ends at begin + count, which may be one
  // past the last column of the dataset and must not be accessed.
  while ((left <= right) &&
      Split::AssignToLeftNode(data.col(left), splitInfo))
    left++;
  while ((left <= right) && (right > 0) &&
      (!Split::AssignToLeftNode(data.col(right), splitInfo)))
    right--;

  // The two cursors can only meet here when the node starts at column 0 and
  // every point belongs to the right node.  Entering the main loop would then
  // decrement 'right' below zero, so return the (empty) left side directly.
  if ((left == right) && (right == 0))
    return left;

  while (left <= right)
  {
    // Swap columns.
    data.swap_cols(left, right);

    // See how many points on the left are correct.  When they are correct,
    // increase the left counter accordingly.  When we encounter one that isn't
    // correct, stop.  We will switch it later.
    while ((left <= right) &&
        Split::AssignToLeftNode(data.col(left), splitInfo))
      left++;

    // Now see how many points on the right are correct.  When they are
    // correct, decrease the right counter accordingly.  When we encounter one
    // that isn't correct, stop.  We will switch it with the wrong point we
    // found in the previous loop.
    while ((left <= right) &&
        (!Split::AssignToLeftNode(data.col(right), splitInfo)))
      right--;
  }

  Log::Assert(left == right + 1);

  return left;
}

/**
 * Partition the points of this node in place so that every point for which
 * Split::AssignToLeftNode() returns true precedes every point for which it
 * returns false, applying the same swaps to oldFromNew.
 *
 * The range check is evaluated before any column is accessed, so the scan
 * never reads a column outside [begin, begin + count).
 *
 * @tparam SplitInfo Type of the object that describes the split.
 * @param data The dataset; columns in [begin, begin + count) are reordered.
 * @param begin Index of the first point that belongs to this node.
 * @param count Number of points in this node.
 * @param splitInfo The information about the split.
 * @param oldFromNew Index mapping; its entries are swapped together with the
 *     corresponding columns of the dataset.
 * @return Index of the first point assigned to the right side.
 */
template<typename MetricType,
         typename StatisticType,
         typename MatType,
         template<typename BoundMetricType, typename...> class BoundType,
         template<typename SplitBoundType, typename SplitMatType>
             class SplitType>
template<typename SplitInfo>
size_t BinarySpaceTree<MetricType, StatisticType, MatType, BoundType,
                       SplitType>::PerformSplit(MatType& data,
                                                const size_t begin,
                                                const size_t count,
                                                const SplitInfo& splitInfo,
                                                std::vector<size_t>& oldFromNew)
{
  // This method modifies the input dataset.  We loop both from the left and
  // right sides of the points contained in this node.  The points assigned to
  // the left node should end up on the left side of the matrix, and the
  // remaining points on the right side of the matrix.
  size_t left = begin;
  size_t right = begin + count - 1;

  // First half-iteration of the loop is out here because the termination
  // condition is in the middle.  The range check comes first: if every point
  // belongs to the left node, 'left' ends at begin + count, which may be one
  // past the last column of the dataset and must not be accessed.
  while ((left <= right) &&
      Split::AssignToLeftNode(data.col(left), splitInfo))
    left++;
  while ((left <= right) && (right > 0) &&
      (!Split::AssignToLeftNode(data.col(right), splitInfo)))
    right--;

  // The two cursors can only meet here when the node starts at column 0 and
  // every point belongs to the right node.  Entering the main loop would then
  // decrement 'right' below zero, so return the (empty) left side directly.
  // Nothing has been swapped yet, so oldFromNew needs no update.
  if ((left == right) && (right == 0))
    return left;

  while (left <= right)
  {
    // Swap columns.
    data.swap_cols(left, right);

    // Update the indices for what we changed.
    size_t t = oldFromNew[left];
    oldFromNew[left] = oldFromNew[right];
    oldFromNew[right] = t;

    // See how many points on the left are correct.  When they are correct,
    // increase the left counter accordingly.  When we encounter one that isn't
    // correct, stop.  We will switch it later.
    while ((left <= right) &&
        Split::AssignToLeftNode(data.col(left), splitInfo))
      left++;

    // Now see how many points on the right are correct.  When they are
    // correct, decrease the right counter accordingly.  When we encounter one
    // that isn't correct, stop.  We will switch it with the wrong point we
    // found in the previous loop.
    while ((left <= right) &&
        (!Split::AssignToLeftNode(data.col(right), splitInfo)))
      right--;
  }

  Log::Assert(left == right + 1);

  return left;
}

/**
 * PerformSplit() overload taken when the split information is the SplitInfo
 * of UBTreeSplit.  No partitioning is done here: the arguments are handed
 * unchanged to the PerformSplit() of this tree's split type and its result is
 * returned as-is.
 *
 * @param data The dataset used by the binary space tree.
 * @param begin Index of the first point that belongs to this node.
 * @param count Number of points in this node.
 * @param splitInfo The information about the split.
 * @return Whatever the split type's PerformSplit() returns.
 */
template<typename MetricType,
         typename StatisticType,
         typename MatType,
         template<typename BoundMetricType, typename...> class BoundType,
         template<typename SplitBoundType, typename SplitMatType>
             class SplitType>
size_t BinarySpaceTree<MetricType, StatisticType, MatType, BoundType,
                       SplitType>::PerformSplit(
    MatType& data,
    const size_t begin,
    const size_t count,
    const typename UBTreeSplit<BoundType<MetricType>,
                               MatType>::SplitInfo& splitInfo)
{
  // Name the concrete split type once so the forwarding call stays readable.
  typedef SplitType<BoundType<MetricType>, MatType> SplitterType;

  return SplitterType::PerformSplit(data, begin, count, splitInfo);
}

/**
 * PerformSplit() overload taken when the split information is the SplitInfo
 * of UBTreeSplit and the index mapping has to be maintained.  No partitioning
 * is done here: the arguments are handed unchanged to the PerformSplit() of
 * this tree's split type and its result is returned as-is.
 *
 * @param data The dataset used by the binary space tree.
 * @param begin Index of the first point that belongs to this node.
 * @param count Number of points in this node.
 * @param splitInfo The information about the split.
 * @param oldFromNew Index mapping passed through to the split type.
 * @return Whatever the split type's PerformSplit() returns.
 */
template<typename MetricType,
         typename StatisticType,
         typename MatType,
         template<typename BoundMetricType, typename...> class BoundType,
         template<typename SplitBoundType, typename SplitMatType>
             class SplitType>
size_t BinarySpaceTree<MetricType, StatisticType, MatType, BoundType,
                       SplitType>::PerformSplit(
    MatType& data,
    const size_t begin,
    const size_t count,
    const typename UBTreeSplit<BoundType<MetricType>,
                               MatType>::SplitInfo& splitInfo,
    std::vector<size_t>& oldFromNew)
{
  // Name the concrete split type once so the forwarding call stays readable.
  typedef SplitType<BoundType<MetricType>, MatType> SplitterType;

  return SplitterType::PerformSplit(data, begin, count, splitInfo,
      oldFromNew);
}

template<typename MetricType,
typename StatisticType,
typename MatType,
Expand Down