Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

R+ and R++ trees implementation #699

Merged
merged 12 commits into from Jul 7, 2016
15 changes: 15 additions & 0 deletions src/mlpack/core/tree/hrectbound.hpp
Expand Up @@ -182,6 +182,21 @@ class HRectBound
template<typename VecType>
bool Contains(const VecType& point) const;

/**
* Determines if this bound partially contains a bound.
*/
bool Contains(const HRectBound& bound) const;

/**
* Returns the intersection of this bound and another.
*/
HRectBound Intersect(const HRectBound& bound) const;

/**
* Returns the volume of overlap of this bound and another.
*/
ElemType Overlap(const HRectBound& bound) const;

/**
* Returns the diameter of the hyperrectangle (that is, the longest diagonal).
*/
Expand Down
54 changes: 54 additions & 0 deletions src/mlpack/core/tree/hrectbound_impl.hpp
Expand Up @@ -143,7 +143,12 @@ inline ElemType HRectBound<MetricType, ElemType>::Volume() const
{
ElemType volume = 1.0;
for (size_t i = 0; i < dim; ++i)
{
if (bounds[i].Lo() >= bounds[i].Hi())
return 0;

volume *= (bounds[i].Hi() - bounds[i].Lo());
}

return volume;
}
Expand Down Expand Up @@ -430,6 +435,55 @@ inline bool HRectBound<MetricType, ElemType>::Contains(const VecType& point) con
return true;
}

template<typename MetricType, typename ElemType>
inline bool HRectBound<MetricType, ElemType>::Contains(
const HRectBound& bound) const
{
for (size_t i = 0; i < dim; i++)
{
const math::RangeType<ElemType>& r_a = bounds[i];
const math::RangeType<ElemType>& r_b = bound.bounds[i];

if (r_a.Hi() <= r_b.Lo() || r_a.Lo() >= r_b.Hi()) // If a does not overlap b at all.
return false;
}

return true;
}

template<typename MetricType, typename ElemType>
inline HRectBound<MetricType, ElemType> HRectBound<MetricType, ElemType>::
Intersect(const HRectBound& bound) const
{
HRectBound<MetricType, ElemType> result(dim);

for (size_t k = 0; k < dim; k++)
{
result[k].Lo() = std::max(bounds[k].Lo(), bound.bounds[k].Lo());
result[k].Hi() = std::min(bounds[k].Hi(), bound.bounds[k].Hi());
}
return result;
}

template<typename MetricType, typename ElemType>
inline ElemType HRectBound<MetricType, ElemType>::Overlap(
const HRectBound& bound) const
{
ElemType volume = 1.0;

for (size_t k = 0; k < dim; k++)
{
ElemType lo = std::max(bounds[k].Lo(), bound.bounds[k].Lo());
ElemType hi = std::min(bounds[k].Hi(), bound.bounds[k].Hi());

if ( hi <= lo)
return 0;

volume *= hi - lo;
}
return volume;
}

/**
* Returns the diameter of the hyperrectangle (that is, the longest diagonal).
*/
Expand Down
29 changes: 4 additions & 25 deletions src/mlpack/core/tree/rectangle_tree/minimal_coverage_sweep.hpp
Expand Up @@ -11,40 +11,19 @@
namespace mlpack {
namespace tree {

constexpr double fillFactor = 0.5;

/**
* The MinimalCoverageSweep class finds a partition along which we
* can split a node according to the coverage of two resulting nodes.
* Moreover, the class evaluates the cost of each split.
* can split a node according to the coverage of two resulting nodes. The class
* finds a partition along a given axis. Moreover, the class evaluates the cost
* of each split. The cost is proportional to the total coverage of resulting
* nodes. If the resulting nodes are overflowed the maximum cost is returned.
*
* @tparam SplitPolicy The class that provides rules for inserting children of
* a node that is being split into two new subtrees.
*/
template<typename SplitPolicy>
class MinimalCoverageSweep
{
private:
/**
* Class to allow for faster sorting.
*/
template<typename ElemType>
struct SortStruct
{
ElemType d;
int n;
};

/**
* Comparator for sorting with SortStruct.
*/
template<typename ElemType>
static bool StructComp(const SortStruct<ElemType>& s1,
const SortStruct<ElemType>& s2)
{
return s1.d < s2.d;
}

public:
//! A struct that provides the type of the sweep cost.
template<typename TreeType>
Expand Down
156 changes: 45 additions & 111 deletions src/mlpack/core/tree/rectangle_tree/minimal_coverage_sweep_impl.hpp
Expand Up @@ -21,27 +21,34 @@ SweepNonLeafNode(const size_t axis,
typename TreeType::ElemType& axisCut)
{
typedef typename TreeType::ElemType ElemType;
typedef bound::HRectBound<metric::EuclideanDistance, ElemType> BoundType;

std::vector<SortStruct<ElemType>> sorted(node->NumChildren());
std::vector<std::pair<ElemType, size_t>> sorted(node->NumChildren());

for (size_t i = 0; i < node->NumChildren(); i++)
{
sorted[i].d = SplitPolicy::Bound(node->Child(i))[axis].Hi();
sorted[i].n = i;
sorted[i].first = SplitPolicy::Bound(node->Child(i))[axis].Hi();
sorted[i].second = i;
}
// Sort high bounds of children.
std::sort(sorted.begin(), sorted.end(), StructComp<ElemType>);
std::sort(sorted.begin(), sorted.end(),
[] (std::pair<ElemType, size_t>& s1,
std::pair<ElemType, size_t>& s2)
{
return s1.first < s2.first;
});

size_t splitPointer = fillFactor * node->NumChildren();
size_t splitPointer = node->NumChildren() / 2;

axisCut = sorted[splitPointer - 1].d;
axisCut = sorted[splitPointer - 1].first;

// Check if the partition is suitable.
// Check if the midpoint split is suitable.
if (!CheckNonLeafSweep(node, axis, axisCut))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you doing this check first before iterating over different values of splitPointer? I guess I don't understand the reasoning. It seems like this entire block of code is just to check that there does exist a valid split on this axis, so I guess the first if is just a shortcut to avoid checking every possible cut. If by default you want to go with a midpoint split, you can hardcode the 0.5 and remove fillFactor, I agree with that. But I guess if the default split does not work, you are trying all other possible splits until you find one that works?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Absolutely so, I'll add a comment in order to clarify that.

{
// Find any suitable partition if the default partition is not acceptable.
for (splitPointer = 1; splitPointer < sorted.size(); splitPointer++)
{
axisCut = sorted[splitPointer - 1].d;
axisCut = sorted[splitPointer - 1].first;
if (CheckNonLeafSweep(node, axis, axisCut))
break;
}
Expand All @@ -50,72 +57,24 @@ SweepNonLeafNode(const size_t axis,
return std::numeric_limits<ElemType>::max();
}

std::vector<ElemType> lowerBound1(node->Bound().Dim());
std::vector<ElemType> highBound1(node->Bound().Dim());
std::vector<ElemType> lowerBound2(node->Bound().Dim());
std::vector<ElemType> highBound2(node->Bound().Dim());
BoundType bound1(node->Bound().Dim());
BoundType bound2(node->Bound().Dim());

// Find lower and high bounds of two resulting nodes.
for (size_t k = 0; k < node->Bound().Dim(); k++)
{
lowerBound1[k] = node->Child(sorted[0].n).Bound()[k].Lo();
highBound1[k] = node->Child(sorted[0].n).Bound()[k].Hi();
// Find bounds of two resulting nodes.
for (size_t i = 0; i < splitPointer; i++)
bound1 |= node->Child(sorted[i].second).Bound();

for (size_t i = 1; i < splitPointer; i++)
{
if (node->Child(sorted[i].n).Bound()[k].Lo() < lowerBound1[k])
lowerBound1[k] = node->Child(sorted[i].n).Bound()[k].Lo();
if (node->Child(sorted[i].n).Bound()[k].Hi() > highBound1[k])
highBound1[k] = node->Child(sorted[i].n).Bound()[k].Hi();
}
for (size_t i = splitPointer; i < node->NumChildren(); i++)
bound2 |= node->Child(sorted[i].second).Bound();

lowerBound2[k] = node->Child(sorted[splitPointer].n).Bound()[k].Lo();
highBound2[k] = node->Child(sorted[splitPointer].n).Bound()[k].Hi();

for (size_t i = splitPointer + 1; i < node->NumChildren(); i++)
{
if (node->Child(sorted[i].n).Bound()[k].Lo() < lowerBound2[k])
lowerBound2[k] = node->Child(sorted[i].n).Bound()[k].Lo();
if (node->Child(sorted[i].n).Bound()[k].Hi() > highBound2[k])
highBound2[k] = node->Child(sorted[i].n).Bound()[k].Hi();
}
}

// Evaluate the cost of the split i.e. calculate the total coverage
// of two resulting nodes.

ElemType area1 = 1.0, area2 = 1.0;
ElemType overlappedArea = 1.0;

for (size_t k = 0; k < node->Bound().Dim(); k++)
{
if (lowerBound1[k] >= highBound1[k])
{
overlappedArea *= 0;
area1 *= 0;
}
else
area1 *= highBound1[k] - lowerBound1[k];

if (lowerBound2[k] >= highBound2[k])
{
overlappedArea *= 0;
area1 *= 0;
}
else
area2 *= highBound2[k] - lowerBound2[k];
ElemType area1 = bound1.Volume();
ElemType area2 = bound2.Volume();

if (lowerBound1[k] < highBound1[k] && lowerBound2[k] < highBound2[k])
{
if (lowerBound1[k] > highBound2[k] || lowerBound2[k] > highBound2[k])
overlappedArea *= 0;
else
overlappedArea *= std::min(highBound1[k], highBound2[k]) -
std::max(lowerBound1[k], lowerBound2[k]);
}
}

return area1 + area2 - overlappedArea;
return area1 + area2;
}

template<typename SplitPolicy>
Expand All @@ -126,73 +85,48 @@ SweepLeafNode(const size_t axis,
typename TreeType::ElemType& axisCut)
{
typedef typename TreeType::ElemType ElemType;
typedef bound::HRectBound<metric::EuclideanDistance, ElemType> BoundType;

std::vector<SortStruct<ElemType>> sorted(node->Count());
std::vector<std::pair<ElemType, size_t>> sorted(node->Count());

sorted.resize(node->Count());

for (size_t i = 0; i < node->NumPoints(); i++)
{
sorted[i].d = node->Dataset().col(node->Point(i))[axis];
sorted[i].n = i;
sorted[i].first = node->Dataset().col(node->Point(i))[axis];
sorted[i].second = i;
}

// Sort high bounds of children.
std::sort(sorted.begin(), sorted.end(), StructComp<ElemType>);
std::sort(sorted.begin(), sorted.end(),
[] (std::pair<ElemType, size_t>& s1,
std::pair<ElemType, size_t>& s2)
{
return s1.first < s2.first;
});

size_t splitPointer = fillFactor * node->Count();
size_t splitPointer = node->Count() / 2;

axisCut = sorted[splitPointer - 1].d;
axisCut = sorted[splitPointer - 1].first;

// Check if the partition is suitable.
if (!CheckLeafSweep(node, axis, axisCut))
return std::numeric_limits<ElemType>::max();

std::vector<ElemType> lowerBound1(node->Bound().Dim());
std::vector<ElemType> highBound1(node->Bound().Dim());
std::vector<ElemType> lowerBound2(node->Bound().Dim());
std::vector<ElemType> highBound2(node->Bound().Dim());

// Find lower and high bounds of two resulting nodes.
for (size_t k = 0; k < node->Bound().Dim(); k++)
{
lowerBound1[k] = node->Dataset().col(node->Point(sorted[0].n))[k];
highBound1[k] = node->Dataset().col(node->Point(sorted[0].n))[k];

for (size_t i = 1; i < splitPointer; i++)
{
if (node->Dataset().col(node->Point(sorted[i].n))[k] < lowerBound1[k])
lowerBound1[k] = node->Dataset().col(node->Point(sorted[i].n))[k];
if (node->Dataset().col(node->Point(sorted[i].n))[k] > highBound1[k])
highBound1[k] = node->Dataset().col(node->Point(sorted[i].n))[k];
}
BoundType bound1(node->Bound().Dim());
BoundType bound2(node->Bound().Dim());

lowerBound2[k] = node->Dataset().col(
node->Point(sorted[splitPointer].n))[k];
highBound2[k] = node->Dataset().col(node->Point(sorted[splitPointer].n))[k];
// Find bounds of two resulting nodes.
for (size_t i = 0; i < splitPointer; i++)
bound1 |= node->Dataset().col(node->Point(sorted[i].second));

for (size_t i = splitPointer + 1; i < node->NumChildren(); i++)
{
if (node->Dataset().col(node->Point(sorted[i].n))[k] < lowerBound2[k])
lowerBound2[k] = node->Dataset().col(node->Point(sorted[i].n))[k];
if (node->Dataset().col(node->Point(sorted[i].n))[k] > highBound2[k])
highBound2[k] = node->Dataset().col(node->Point(sorted[i].n))[k];
}
}
for (size_t i = splitPointer; i < node->NumChildren(); i++)
bound2 |= node->Dataset().col(node->Point(sorted[i].second));

// Evaluate the cost of the split i.e. calculate the total coverage
// of two resulting nodes.

ElemType area1 = 1.0, area2 = 1.0;
ElemType overlappedArea = 1.0;

for (size_t k = 0; k < node->Bound().Dim(); k++)
{
area1 *= highBound1[k] - lowerBound1[k];
area2 *= highBound2[k] - lowerBound2[k];
}

return area1 + area2 - overlappedArea;
return bound1.Volume() + bound2.Volume();
}

template<typename SplitPolicy>
Expand Down
Expand Up @@ -14,34 +14,17 @@ namespace tree {
/**
* The MinimalSplitsNumberSweep class finds a partition along which we
* can split a node according to the number of required splits of the node.
* Moreover, the class evaluates the cost of each split.
* The class finds a partition along a given axis. Moreover, the class evaluates
* the cost of each split. The cost is proportional to the number of required
* splits and the difference of sizes of resulting nodes. If the resulting nodes
* are overflowed the maximum cost is returned.
*
* @tparam SplitPolicy The class that provides rules for inserting children of
* a node that is being split into two new subtrees.
*/
template<typename SplitPolicy>
class MinimalSplitsNumberSweep
{
private:
/**
* Class to allow for faster sorting.
*/
template<typename ElemType>
struct SortStruct
{
ElemType d;
int n;
};

/**
* Comparator for sorting with SortStruct.
*/
template<typename ElemType>
static bool StructComp(const SortStruct<ElemType>& s1,
const SortStruct<ElemType>& s2)
{
return s1.d < s2.d;
}
public:
//! A struct that provides the type of the sweep cost.
template<typename>
Expand Down