Skip to content

Commit

Permalink
Pre bucket geo points for clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmach committed Jan 24, 2015
1 parent febd9af commit 3decab4
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 1 deletion.
95 changes: 95 additions & 0 deletions clustering/buckets.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package clustering

import "github.com/paulmach/go.geo"

// buckets is a uniform grid laid over a bound. Each cell stores the indexes
// that were handed to Add for points falling inside it.
type buckets struct {
	// southWest is the grid origin; positions are measured relative to it.
	southWest *geo.Point

	// threshold is the side length of one cell.
	threshold float64

	// width and height are the number of cells along x and y.
	width, height int

	// data holds the cells in row-major order: cell (x, y) lives at
	// y*width + x. A cell that never received an index stays nil.
	data [][]int
}

// newBuckets builds an empty grid covering bound, using cells that are
// threshold wide unless that would need more than maxsize cells. In that case
// the cell size is enlarged (and the cell counts reduced) so the grid stays
// within the limit; the enlarged size is what gets stored in the result.
//
// The returned grid always has at least one column and one row, so Add and
// Near can index it safely.
func newBuckets(bound *geo.Bound, threshold float64) *buckets {
	// Upper limit on the number of cells allocated for the grid.
	const maxsize = 4000.0 * 4000.0

	width := bound.Width() / threshold
	height := bound.Height() / threshold

	// A bound thinner than one threshold still needs a cell.
	if width < 1 {
		width = 1.0
	}

	if height < 1 {
		height = 1.0
	}

	if height*width > maxsize {
		factor := maxsize / (height * width)

		threshold /= factor
		height *= factor
		width *= factor
	}

	// The rescale above can push one dimension below 1 for very elongated
	// bounds (e.g. 1 x 1e9 cells). Converting that to int would give 0 and an
	// empty data slice, making every later Add panic, so clamp again here.
	cols, rows := int(width), int(height)
	if cols < 1 {
		cols = 1
	}

	if rows < 1 {
		rows = 1
	}

	return &buckets{
		southWest: bound.SouthWest(),
		threshold: threshold,
		width:     cols,
		height:    rows,
		data:      make([][]int, rows*cols),
	}
}

// position converts p into the (x, y) cell coordinates of the grid, measuring
// from the south-west corner in units of threshold. Coordinates that fall
// outside the grid are pulled back to the nearest edge cell.
func (b *buckets) position(p *geo.Point) (int, int) {
	// clamp applies the upper limit first and the lower limit second.
	clamp := func(v, size int) int {
		if v >= size {
			v = size - 1
		}
		if v < 0 {
			v = 0
		}
		return v
	}

	col := int((p[0] - b.southWest[0]) / b.threshold)
	row := int((p[1] - b.southWest[1]) / b.threshold)

	return clamp(col, b.width), clamp(row, b.height)
}

// Add records index in the cell that contains p.
func (b *buckets) Add(p *geo.Point, index int) {
	col, row := b.position(p)

	// Cells are stored row-major.
	cell := &b.data[row*b.width+col]
	*cell = append(*cell, index)
}

// Near returns the non-empty cells in the 3x3 block centered on the cell that
// contains p. Cells outside the grid are skipped, so the result holds at most
// nine entries. The returned slices are the grid's own cells, not copies.
func (b *buckets) Near(p *geo.Point) [][]int {
	col, row := b.position(p)

	// Clip the 3x3 window to the grid up front.
	firstCol, lastCol := col-1, col+1
	if firstCol < 0 {
		firstCol = 0
	}
	if lastCol >= b.width {
		lastCol = b.width - 1
	}

	firstRow, lastRow := row-1, row+1
	if firstRow < 0 {
		firstRow = 0
	}
	if lastRow >= b.height {
		lastRow = b.height - 1
	}

	found := make([][]int, 0, 9)
	for c := firstCol; c <= lastCol; c++ {
		for r := firstRow; r <= lastRow; r++ {
			if cell := b.data[r*b.width+c]; cell != nil {
				found = append(found, cell)
			}
		}
	}

	return found
}
87 changes: 87 additions & 0 deletions clustering/buckets_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package clustering

import (
"testing"

"github.com/paulmach/go.geo"
)

// TestNewBuckets checks the grid dimensions and cell size chosen for a
// 4000x2000 bound. A requested threshold of 0.5 would exceed the cell limit,
// so it is expected to end up with the same grid as a threshold of 1.
func TestNewBuckets(t *testing.T) {
	bound := geo.NewBound(0, 4000, 0, 2000)

	for _, requested := range []float64{1, 0.5} {
		grid := newBuckets(bound, requested)

		if grid.width != 4000 {
			t.Errorf("incorrect width, got %v", grid.width)
		}

		if grid.height != 2000 {
			t.Errorf("incorrect height, got %v", grid.height)
		}

		if grid.threshold != 1 {
			t.Errorf("incorrect threshold, got %v", grid.threshold)
		}
	}
}

// TestBucketsAdd verifies that the two opposite corners of the bound land in
// the first and the last cell of the grid.
func TestBucketsAdd(t *testing.T) {
	bound := geo.NewBound(0, 4000, 0, 2000)
	grid := newBuckets(bound, 1)

	// South-west corner goes into the first cell.
	grid.Add(geo.NewPoint(0, 0), 1)
	if first := grid.data[0]; len(first) != 1 {
		t.Errorf("bucket not saved to correct location")
	}

	// North-east corner lies on the far edge and goes into the last cell.
	grid.Add(bound.NorthEast(), 2)
	lastIndex := len(grid.data) - 1
	if last := grid.data[lastIndex]; len(last) != 1 {
		t.Errorf("bucket not saved to correct location")
	}
}

// TestBucketsNear checks that Near returns only the populated cells around a
// point and that the returned cells carry the expected indexes.
func TestBucketsNear(t *testing.T) {
	bound := geo.NewBound(0, 4000, 0, 2000)
	grid := newBuckets(bound, 1)

	grid.Add(geo.NewPoint(0, 0), 1)

	// Only the cell holding index 1 is populated; its empty neighbors must
	// not be returned.
	near := grid.Near(geo.NewPoint(0, 0))
	if l := len(near); l != 1 {
		t.Errorf("should not include empty buckets, got %v buckets", l)
	}

	// Cell (1, 1) is adjacent to cell (0, 0), so both show up.
	grid.Add(geo.NewPoint(1.5, 1.5), 2)
	near = grid.Near(geo.NewPoint(1.5, 1.5))
	if l := len(near); l != 2 {
		t.Errorf("expected 2 populated buckets near (1.5, 1.5), got %v", l)
	}

	// Repeating the query must give the same answer.
	near = grid.Near(geo.NewPoint(1.5, 1.5))
	if l := len(near); l != 2 {
		t.Errorf("expected 2 populated buckets on repeated query, got %v", l)
	}

	// From cell (2, 2) only cell (1, 1) is within reach. This check is fatal
	// because the index check below reads near[0].
	near = grid.Near(geo.NewPoint(2.5, 2.5))
	if l := len(near); l != 1 {
		t.Fatalf("expected 1 populated bucket near (2.5, 2.5), got %v", l)
	}

	if got := near[0][0]; got != 2 {
		t.Errorf("returned wrong cluster, got %v", got)
	}
}
44 changes: 43 additions & 1 deletion clustering/clustering.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ func geocluster(clusters []*Cluster, threshold float64) []*Cluster {
clusters,
// Default initialization, TODO: better bucketing/prefiltering will greatly increase performance.
// can use the bound above to help with this.
initClusterDistances(clusters, CentroidSquaredDistance{}, scaledThreshold),
// initClusterDistances(clusters, CentroidSquaredDistance{}, scaledThreshold),
initGeoClusterDistances(bound, clusters, CentroidSquaredDistance{}, scaledThreshold),
CentroidSquaredDistance{},
scaledThreshold,
)
Expand Down Expand Up @@ -161,6 +162,47 @@ func initClusterDistances(
return distances
}

// initGeoClusterDistances builds the initial distance sets for clusters, but
// only between clusters whose centroids fall in the same or in adjacent grid
// cells. The grid is built over bound with the given threshold as cell size.
//
// Every cluster gets a distance set, and its distance to itself is set to
// math.MaxInt32. Distances are stored symmetrically: each computed pair is
// written into both clusters' sets.
func initGeoClusterDistances(
	bound *geo.Bound,
	clusters []*Cluster,
	distancer ClusterDistancer,
	threshold float64,
) []*distanceSet {

	// Drop every centroid into the grid, remembering its cluster index.
	grid := newBuckets(bound, threshold)
	for idx := range clusters {
		grid.Add(clusters[idx].Centroid, idx)
	}

	sets := make([]*distanceSet, len(clusters))
	for i, current := range clusters {
		if sets[i] == nil {
			sets[i] = newDistanceSet()
		}
		sets[i].Set(i, math.MaxInt32)

		for _, cell := range grid.Near(current.Centroid) {
			for _, j := range cell {
				// Handle each unordered pair once, from the higher index.
				if j >= i {
					continue
				}

				d := distancer.ClusterDistance(current, clusters[j])
				if sets[j] == nil {
					sets[j] = newDistanceSet()
				}

				sets[j].Set(i, d)
				sets[i].Set(j, d)
			}
		}
	}

	return sets
}

func clusterClusters(
clusters []*Cluster,
distanceSets []*distanceSet,
Expand Down
14 changes: 14 additions & 0 deletions clustering/clustering_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,20 @@ func BenchmarkClusterGeoClusters(b *testing.B) {
}
}

// BenchmarkClusterGeoPointers repeatedly runs ClusterGeoPointers(pointers, 30)
// on the prefiltered test data and aborts if a run does not yield 26 clusters.
//
// To profile:
// > go test -c && ./clustering.test -test.bench=ClusterGeoPointers -test.cpuprofile=cpu.out -test.benchtime=10s
// > go tool pprof clustering.test cpu.out
func BenchmarkClusterGeoPointers(b *testing.B) {
	_, pointers := loadPrefilteredTestClusters(b)

	// Keep fixture loading out of the measurement.
	b.ResetTimer()
	for run := 0; run < b.N; run++ {
		if got := len(ClusterGeoPointers(pointers, 30)); got != 26 {
			b.Fatalf("incorrect number of clusters, got %v", got)
		}
	}
}

func BenchmarkInitClusterDistances(b *testing.B) {
clusters, _ := loadPrefilteredTestClusters(b)

Expand Down

0 comments on commit 3decab4

Please sign in to comment.