Skip to content

Commit

Permalink
Added data generated from a normal distribution using Box-Muller
Browse files Browse the repository at this point in the history
Transform.
Fixed up documentation with the new results.
  • Loading branch information
Michael Pigott committed Jun 24, 2014
1 parent f2831db commit d2df14d
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 29 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -31,7 +31,7 @@ Likewise, the `CyclePartitioner` partitions in `O(N)` time complexity in the bes

I have found that larger classes reduce the total number of moves needed to partition the data. This makes intuitive sense: the larger the average class size, the smaller the cascading effect when one class fills up.

I have also found, surprisingly, that despite the `CdfPartitionFunction`'s additional complexity, it does not perform significantly better than the `FlashSortPartitionFunction` in the number of moves. In fact, when the number of samples taken is much smaller than the number of elements, I have found the `FlashSortPartitionFunction` to perform much better - both in the standard deviation of class size, and in total number of moves required to partition the data.
I have found that the `CdfPartitionFunction` is only significantly better than the `FlashSortPartitionFunction` (in class-size standard deviation and wall-clock performance) when the data is normally distributed. For evenly-distributed data, both show similar wall-clock performance, while the `FlashSortPartitionFunction` shows a better class-size standard deviation.

## External Libraries

Expand Down
10 changes: 8 additions & 2 deletions algorithms/src/mpigott/sort/CdfPartitionFunction.java
Expand Up @@ -108,7 +108,7 @@ public CdfPartitionFunction(List<T> input, int cellSize, double alpha, double cd
}
}

perCellRange = (max.distance(min) + 1.0) / numCells;
perCellRange = max.distance(min) / numCells;

int[] sampleCountsPerCell = new int[(int) numCells];

Expand Down Expand Up @@ -173,7 +173,13 @@ public int getClass(T element) {
final double x = value - prevRange;
final double px = slope * x + prevCdf;

return (int) (px * numCells);
double classification = (int)(px * numCells);

if (classification >= numCells) {
classification = (numCells - 1.0);
}

return (int) classification;
}

/**
Expand Down
158 changes: 132 additions & 26 deletions algorithms/test/mpigott/sort/CdfDataPartitionSortTest.java
Expand Up @@ -60,9 +60,9 @@ public void testPartitionFunction3() {
SummaryStatistics statistics = getBucketStatistics(input, func);

assertEquals(300, statistics.getN());
assertTrue(statistics.getStandardDeviation() < 1.0);
assertEquals(99, (int) statistics.getMin());
assertEquals(101, (int) statistics.getMax());
assertEquals( 0, (int) statistics.getStandardDeviation());
assertEquals(100, (int) statistics.getMin());
assertEquals(100, (int) statistics.getMax());
}

@Test
Expand All @@ -75,9 +75,9 @@ public void testPartitionFunction4() {
SummaryStatistics statistics = getBucketStatistics(input, func);

assertEquals(100, statistics.getN());
assertTrue(statistics.getStandardDeviation() < 1.0);
assertEquals(99, (int) statistics.getMin());
assertEquals(101, (int) statistics.getMax());
assertEquals( 0, (int) statistics.getStandardDeviation());
assertEquals(100, (int) statistics.getMin());
assertEquals(100, (int) statistics.getMax());
}

@Test
Expand Down Expand Up @@ -141,36 +141,75 @@ public void testRandomPartitioning3() {
}

@Test
public void testVariousSorters() {
ArrayList<NumericElement<Double>> cdfPartitionInput = createRandomInput(1000000, -250000.0, 1250000.0);
public void testCdfRanges() {
    // Both runs see the same elements; the second list is a shallow copy taken
    // before any partitioning reorders the first one.
    final ArrayList<NumericElement<Double>> largeClassInput = createStandardNormalRandomInput(1000000, 1250000.0);
    final ArrayList<NumericElement<Double>> smallClassInput =
        new ArrayList<NumericElement<Double>>(largeClassInput);

    // Run 1: cell size of 10000.
    final CdfPartitionFunction<NumericElement<Double>, Double> largeClassFunc =
        new CdfPartitionFunction<NumericElement<Double>, Double>(largeClassInput, 10000, 0.05, 0.0056);
    final int[] largeClassBounds = CyclePartitioner.partition(largeClassInput, largeClassFunc);
    final SummaryStatistics largeClassStats = getClassBoundsStatistics(largeClassBounds);

    // Run 2: cell size of 1000, all other parameters unchanged.
    final CdfPartitionFunction<NumericElement<Double>, Double> smallClassFunc =
        new CdfPartitionFunction<NumericElement<Double>, Double>(smallClassInput, 1000, 0.05, 0.0056);
    final int[] smallClassBounds = CyclePartitioner.partition(smallClassInput, smallClassFunc);
    final SummaryStatistics smallClassStats = getClassBoundsStatistics(smallClassBounds);

    // Compare standard deviation relative to the mean so the two runs are
    // comparable despite their different average class sizes.
    final double largeClassRelativeStdDev = largeClassStats.getStandardDeviation() / largeClassStats.getMean();
    final double smallClassRelativeStdDev = smallClassStats.getStandardDeviation() / smallClassStats.getMean();

    assertTrue("The relative standard deviation of the output with larger class sizes should be smaller than the relative standard deviation of the output with smaller class sizes.", largeClassRelativeStdDev < smallClassRelativeStdDev);
}

@Test
public void testVariousSortersRandomInput() {
ArrayList<NumericElement<Double>> cdfPartitionInput = createRandomInput(1000000, 0.0, 15000.0);
ArrayList<NumericElement<Double>> flashSortInput = (ArrayList<NumericElement<Double>>) cdfPartitionInput.clone();

CdfPartitionFunction<NumericElement<Double>, Double> cdfPartitionFunc =
new CdfPartitionFunction<NumericElement<Double>, Double>(cdfPartitionInput, 1000, 0.05, 0.0056);
new CdfPartitionFunction<NumericElement<Double>, Double>(cdfPartitionInput, 10000, 0.05, 0.0056);

int[] cdfPartitionClassBounds = CyclePartitioner.partition(cdfPartitionInput, cdfPartitionFunc);

SummaryStatistics cdfPartitionStatistics = new SummaryStatistics();
int prevClassBound = 0;
for (int classBound : cdfPartitionClassBounds) {
cdfPartitionStatistics.addValue(classBound - prevClassBound);
prevClassBound = classBound;
}
checkInput(cdfPartitionInput, cdfPartitionFunc, cdfPartitionClassBounds);

FlashSortPartitionFunction<NumericElement<Double>, Double> fsPartitionFunc =
new FlashSortPartitionFunction<NumericElement<Double>, Double>(flashSortInput, 1000);
new FlashSortPartitionFunction<NumericElement<Double>, Double>(flashSortInput, 100);

int[] fsClassBounds = CyclePartitioner.partition(flashSortInput, fsPartitionFunc);

SummaryStatistics fsPartitionStatistics = new SummaryStatistics();
prevClassBound = 0;
for (int classBound : fsClassBounds) {
fsPartitionStatistics.addValue(classBound - prevClassBound);
prevClassBound = classBound;
}
checkInput(flashSortInput, fsPartitionFunc, fsClassBounds);

System.out.println(cdfPartitionStatistics);
System.out.println(fsPartitionStatistics);
SummaryStatistics cdfPartitionStatistics = getClassBoundsStatistics(cdfPartitionClassBounds);
SummaryStatistics flashSortStatistics = getClassBoundsStatistics(fsClassBounds);

assertTrue("For evenly-distributed random input, the standard deviation of flash sort should be smaller.", flashSortStatistics.getStandardDeviation() < cdfPartitionStatistics.getStandardDeviation());
}

/**
 * Partitions the same generated input with both a {@code CdfPartitionFunction}
 * and a {@code FlashSortPartitionFunction}, validates each partitioning with
 * {@code checkInput}, and asserts that the CDF-based partitioning yields the
 * smaller standard deviation of class sizes.
 */
@Test
public void testVariousSortersStandardNormalRandomInput() {
    ArrayList<NumericElement<Double>> cdfPartitionInput = createStandardNormalRandomInput(1000000, 9000.0);
    // Shallow copy taken before partitioning so both functions start from the same ordering.
    ArrayList<NumericElement<Double>> flashSortInput = (ArrayList<NumericElement<Double>>) cdfPartitionInput.clone();

    CdfPartitionFunction<NumericElement<Double>, Double> cdfPartitionFunc =
        new CdfPartitionFunction<NumericElement<Double>, Double>(cdfPartitionInput, 10000, 0.05, 0.0056);

    int[] cdfPartitionClassBounds = CyclePartitioner.partition(cdfPartitionInput, cdfPartitionFunc);

    checkInput(cdfPartitionInput, cdfPartitionFunc, cdfPartitionClassBounds);

    FlashSortPartitionFunction<NumericElement<Double>, Double> fsPartitionFunc =
        new FlashSortPartitionFunction<NumericElement<Double>, Double>(flashSortInput, 100);

    int[] fsClassBounds = CyclePartitioner.partition(flashSortInput, fsPartitionFunc);

    checkInput(flashSortInput, fsPartitionFunc, fsClassBounds);

    SummaryStatistics cdfPartitionStatistics = getClassBoundsStatistics(cdfPartitionClassBounds);
    SummaryStatistics flashSortStatistics = getClassBoundsStatistics(fsClassBounds);

    // The message names the input this test actually uses (normally distributed,
    // not evenly distributed) so a failure is reported accurately.
    assertTrue("For normally-distributed random input, the standard deviation of CDF-based partitioning should be smaller.", flashSortStatistics.getStandardDeviation() > cdfPartitionStatistics.getStandardDeviation());
}

private ArrayList<NumericElement<Double>> createNonRandomInput(int numElements, double min) {
Expand Down Expand Up @@ -201,6 +240,25 @@ private ArrayList<NumericElement<Double>> createRandomInput(int numElems, double
return input;
}

/**
 * Builds a list of {@code numElems} elements whose values come from a
 * {@code StandardNormalRandomNumberGenerator} constructed with the given
 * variance, each stored at a randomly chosen free position.
 *
 * @param numElems the number of elements to generate.
 * @param variance the variance handed to the generator.
 * @return the populated list.
 */
private ArrayList<NumericElement<Double>> createStandardNormalRandomInput(int numElems, double variance) {
    final StandardNormalRandomNumberGenerator generator = new StandardNormalRandomNumberGenerator(variance);

    // Negative infinity marks a slot that has not been assigned a value yet.
    final ArrayList<NumericElement<Double>> result = new ArrayList<NumericElement<Double>>(numElems);
    int prefilled = 0;
    while (prefilled < numElems) {
        result.add(new NumericElement<Double>(Double.NEGATIVE_INFINITY));
        ++prefilled;
    }

    final Random slotPicker = new Random(System.currentTimeMillis());
    for (int remaining = numElems; remaining > 0; --remaining) {
        // Start at a random slot and walk forward (wrapping around) to the first free one.
        int slot = slotPicker.nextInt(numElems);
        while (result.get(slot).getValue() > Double.NEGATIVE_INFINITY) {
            slot = (slot + 1) % numElems;
        }
        result.set(slot, new NumericElement<Double>(generator.getNextRandom()));
    }

    return result;
}

private <T extends Element<U>, U> SummaryStatistics getBucketStatistics(ArrayList<T> input, CdfPartitionFunction<T, U> func) {
TreeMap<Integer, Double> bucketSizes = new TreeMap<Integer, Double>();
for (int i = 0; i < input.size(); ++i) {
Expand All @@ -221,17 +279,27 @@ private <T extends Element<U>, U> SummaryStatistics getBucketStatistics(ArrayLis
return statistics;
}

/**
 * Collects statistics over the differences between consecutive entries of
 * {@code classBounds}; the first entry is measured against zero.
 *
 * @param classBounds the class boundary indices to summarize.
 * @return statistics over the successive differences.
 */
private SummaryStatistics getClassBoundsStatistics(int[] classBounds) {
    final SummaryStatistics sizes = new SummaryStatistics();
    for (int i = 0; i < classBounds.length; ++i) {
        final int lowerBound = (i == 0) ? 0 : classBounds[i - 1];
        sizes.addValue(classBounds[i] - lowerBound);
    }
    return sizes;
}

private void checkInput(ArrayList<NumericElement<Double>> output, PartitionFunction<NumericElement<Double>, Double> func, int[] classBounds) {
assertNotNull(classBounds);

// Confirm the max value of a class is less than the min value of the next upper class.
double[] prevMinAndMax = getMinAndMax(output, classBounds, 0);
double[] currMinAndMax = null;

assertEquals("The element in the upper bound of the class 0 does not belong to class zero.", 0, func.getClass(output.get(classBounds[0])));
assertTrue("The element in the upper bound of the class 0 does not belong to class zero.", (func.getClass(output.get(classBounds[0])) == 0) || (classBounds[0] == 0));

for (int i = 1; i < classBounds.length; ++i) {
assertEquals("The element in the upper bound of the class " + i + " does not belong to class " + i + ".", i, func.getClass(output.get(classBounds[i])));
assertTrue("The element in the upper bound of the class " + i + " does not belong to class " + i + ".", (func.getClass(output.get(classBounds[i])) == i) || (classBounds[i] == classBounds[i - 1]));
currMinAndMax = getMinAndMax(output, classBounds, i);
final boolean isValid = prevMinAndMax[1] <= currMinAndMax[0];
assertTrue("Maximum of class " + (i - 1) + " (" + prevMinAndMax[1] + ") must be less than the min of class " + i + " (" + currMinAndMax[0] + ").", isValid);
Expand Down Expand Up @@ -276,4 +344,42 @@ private double[] getMinAndMax(ArrayList<NumericElement<Double>> input, int[] cla

return minAndMax;
}

/**
 * Generates zero-mean, normally distributed random numbers with the
 * Box-Muller transform (see http://en.wikipedia.org/wiki/Box_Muller_transform).
 *
 * Each pair of uniform draws yields two normal values: the cosine-based value
 * is returned immediately and the sine-based value is returned by the next call.
 * The underlying {@link Random} is seeded from the current time, so sequences
 * are not reproducible between runs.
 */
private static class StandardNormalRandomNumberGenerator {
    private static final double TWO_PI = 2.0 * Math.PI;

    private final Random random;
    private final double variance;

    /** True when the sine-based value of the last pair has not been returned yet. */
    private boolean hasSpare;
    /** Holds -2 * ln(u1) for the current pair. */
    private double rand1;
    /** Holds the angle 2 * pi * u2 for the current pair. */
    private double rand2;

    /**
     * @param variance the variance of the generated values; each value is
     *                 scaled by the square root of this number.
     */
    public StandardNormalRandomNumberGenerator(double variance) {
        this.hasSpare = false;
        this.rand1 = 0.0;
        this.rand2 = 0.0;
        this.random = new Random(System.currentTimeMillis());
        this.variance = variance;
    }

    /**
     * @return the next normally distributed value.
     */
    public double getNextRandom() {
        if (hasSpare) {
            hasSpare = false;
            // The second Box-Muller value is radius * sin(angle); the radius must
            // be multiplied by the sine, not added to it.
            return Math.sqrt(variance * rand1) * Math.sin(rand2);
        }

        hasSpare = true;

        rand1 = random.nextDouble();
        // nextDouble() may return 0.0; clamp so Math.log never sees zero.
        if (rand1 < 1e-100) {
            rand1 = 1e-100;
        }
        rand1 = -2 * Math.log(rand1);

        rand2 = random.nextDouble() * TWO_PI;

        return Math.sqrt(variance * rand1) * Math.cos(rand2);
    }
}
}

0 comments on commit d2df14d

Please sign in to comment.