Permalink
Browse files

first commit

  • Loading branch information...
0 parents commit 0082ae92552fcb41eabb36c591c8aa0e618dcf95 Marcin Bielak committed Jan 25, 2010
Showing with 8,538 additions and 0 deletions.
  1. +11 −0 K-Means_clustering/README
  2. +132 −0 K-Means_clustering/cluster.php
  3. +132 −0 K-Means_clustering/clustering.py
  4. BIN K-Means_clustering/masterThesis-VR.pdf
  5. +163 −0 K-Means_clustering/with_normalize.out.txt
  6. +163 −0 K-Means_clustering/without_normalize.out.txt
  7. +1 −0 README
  8. +41 −0 comparing_images_with_python/pil-comparing-images.htm
  9. BIN games_art/jacIII07.pdf
  10. BIN theory/ BarronEtAl_CICLing09.pdf
  11. +117 −0 theory/CVXMOD – Convex optimization software in Python.html
  12. +261 −0 theory/Calculating distance between two documents.html
  13. +261 −0 theory/Clustering web pages into similar topics.html
  14. +282 −0 theory/Compare two PIL images in Python - Python - Snipplr.html
  15. BIN ...ndations of Statistical Natural Language Processing - Christopher D. Manning - Chapter 1.pdf.maff
  16. +612 −0 theory/K-Means Clustering in PHP at Code Blip.html
  17. BIN theory/LaTourette_LMM.pdf
  18. BIN theory/Maximum Entropy Modeling Toolkit for Python and C++ manual.pdf
  19. BIN theory/Modelling image complexity by independent component analysis Perkio09ICANN.pdf
  20. +1,410 −0 theory/PyMix - Python Mixture Package mixture-module.html
  21. 0 theory/Resources.htm
  22. +3,092 −0 theory/Searching-Eye.html
  23. BIN theory/Testing the Significance of Attribute Interactions jakulin-bratko-ICML2004-P.pdf
  24. BIN theory/Tiny Corpus Applications with Transformation-Based Error-Driven oral-17.pdf
  25. BIN theory/Utilizing Folksonomy: Similarity Metadata from the Del.icio.us web-project.pdf
  26. +1,021 −0 theory/compare-two-images-the-python-linux-way.html
  27. +839 −0 theory/k-means-clustering-whats-wrong-php.html
@@ -0,0 +1,11 @@
+
+Links:
+
+http://blogs.sun.com/yongsun/entry/k_means_and_k_means
+http://www.solcoproject.net/?secid=6&pid=30
+http://sourceforge.net/search/?type_of_search=soft&words=k-means&search=Search
+http://www.inb.mu-luebeck.de/biosoft/biopython/api/Bio/Tools/Clustering/kMeans.py.html
+http://www.inb.mu-luebeck.de/biosoft/biopython/api/index.html
+http://www.phpandme.net/2009/03/calculating-distance-between-two-documents/
+http://www.phpandme.net/2009/03/bayesian-text-classification/
+http://people.revoledu.com/kardi/tutorial/kMean/Resources.htm
@@ -0,0 +1,132 @@
<?php
/**
 * K-means clustering demo with optional centroid normalisation.
 *
 * Clusters a small hard-coded two-dimensional data set into three groups and
 * dumps the structure returned by kMeans().
 *
 * @see http://phpir.com/clustering
 */

// The constant name has to be a quoted string. A bare WITH_NORMALIZE is read as
// a (not yet defined) constant, which is a notice/warning on PHP 5/7 and a
// fatal Error on PHP 8.
define('WITH_NORMALIZE', false);

// Each row is one document: a point whose two coordinates sum to 1.
$data = array(
    array(0.05, 0.95),
    array(0.1, 0.9),
    array(0.2, 0.8),
    array(0.25, 0.75),
    array(0.45, 0.55),
    array(0.5, 0.5),
    array(0.55, 0.45),
    array(0.85, 0.15),
    array(0.9, 0.1),
    array(0.95, 0.05)
);


var_dump(kMeans($data, 3, WITH_NORMALIZE));
+
/**
 * Create $k random starting centroids inside the bounding box of the data.
 *
 * The per-dimension minimum and maximum over all documents are collected
 * first; every centroid is then produced by initialiseCentroid() from those
 * bounds.
 *
 * @param array $data      list of documents, each mapping dimension => value
 * @param int   $k         number of centroids to create
 * @param bool  $normalize passed through unchanged to initialiseCentroid()
 * @return array centroids indexed 0 .. $k-1
 */
function initialiseCentroids(array $data, $k, $normalize = false) {
    // The first document decides how many dimensions a centroid gets.
    $dimensions = count($data[0]);
    $upper = array();
    $lower = array();

    foreach ($data as $document) {
        foreach ($document as $dim => $val) {
            $raiseUpper = !isset($upper[$dim]) || $val > $upper[$dim];
            $upper[$dim] = $raiseUpper ? $val : $upper[$dim];

            $dropLower = !isset($lower[$dim]) || $val < $lower[$dim];
            $lower[$dim] = $dropLower ? $val : $lower[$dim];
        }
    }

    $centroids = array();
    for ($made = 0; $made < $k; $made++) {
        $centroids[] = initialiseCentroid($dimensions, $upper, $lower, $normalize);
    }

    return $centroids;
}
+
/**
 * Create one random centroid with each coordinate inside [$dimmin, $dimmax].
 *
 * @param int   $dimensions number of coordinates to generate (indices 0 .. $dimensions-1)
 * @param array $dimmax     per-dimension upper bound
 * @param array $dimmin     per-dimension lower bound
 * @param bool  $normalize  anything other than boolean false sends the
 *                          centroid through normaliseValue() with the sum of
 *                          its coordinates
 * @return array the centroid, indexed by dimension
 */
function initialiseCentroid($dimensions, $dimmax, $dimmin, $normalize = false) {
    $total = 0;
    $centroid = array();
    for ($j = 0; $j < $dimensions; $j++) {
        // rand() only produces integers, so the draw is made in a range scaled
        // up by 1000 and then scaled back down. Without the division the
        // centroid ends up roughly 1000 times outside the data it is meant to
        // sit in. The bounds are rounded to integers explicitly because
        // products such as 0.29 * 1000 are not exact floats.
        $low = (int) round($dimmin[$j] * 1000);
        $high = (int) round($dimmax[$j] * 1000);
        $centroid[$j] = rand($low, $high) / 1000;
        $total += $centroid[$j];
    }

    if (false === $normalize) {
        return $centroid;
    }

    return normaliseValue($centroid, $total);
}
+
/**
 * Cluster $data into $k groups with the k-means algorithm.
 *
 * Starting from random centroids, documents are assigned to their nearest
 * centroid and the centroids are recomputed, until two consecutive passes
 * produce exactly the same assignment.
 *
 * @param array $data      list of documents, each mapping dimension => value
 * @param int   $k         number of clusters
 * @param bool  $normalize forwarded to initialiseCentroids()
 * @return array the structure built by formatResults()
 */
function kMeans($data, $k, $normalize = false) {
    // Pass the caller's flag through. The previous `$normalize = false` in the
    // argument list was an assignment that silently discarded it.
    $centroids = initialiseCentroids($data, $k, $normalize);
    $mapping = null;

    while (true) {
        $new_mapping = assignCentroids($data, $centroids);

        // Converged only when no document changed cluster. The whole mapping
        // is compared; looking at the first document alone ends the loop too
        // early.
        if ($new_mapping === $mapping) {
            return formatResults($mapping, $data, $centroids);
        }

        $mapping = $new_mapping;
        $centroids = updateCentroids($mapping, $data, $k);
    }
}
+
/**
 * Turn a document => centroid mapping into a printable result structure.
 *
 * @param array $mapping   document ID => centroid ID
 * @param array $data      document ID => list of coordinate values
 * @param array $centroids stored unchanged under the 'centroids' key
 * @return array 'centroids' plus, per centroid ID, the list of member
 *               documents rendered as comma-separated strings
 */
function formatResults($mapping, $data, $centroids) {
    $grouped = array('centroids' => $centroids);

    foreach ($mapping as $docId => $clusterId) {
        $line = implode(',', $data[$docId]);
        $grouped[$clusterId][] = $line;
    }

    return $grouped;
}
+
/**
 * Assign every document to its nearest centroid.
 *
 * Distance is the sum of absolute per-dimension differences (Manhattan
 * distance). On a tie the centroid that comes first in $centroids wins.
 *
 * @param array $data      document ID => (dimension => value)
 * @param array $centroids centroid ID => (dimension => value)
 * @return array document ID => centroid ID; the value is null only when
 *               $centroids is empty
 */
function assignCentroids($data, $centroids) {
    $mapping = array();

    foreach ($data as $documentID => $document) {
        // Start with "no candidate yet" rather than a fixed distance cap. A
        // cap of 100 left documents unassigned whenever every centroid was
        // at least that far away.
        $minDist = null;
        $minCentroid = null;

        foreach ($centroids as $centroidID => $centroid) {
            $dist = 0;
            foreach ($centroid as $dim => $value) {
                $dist += abs($value - $document[$dim]);
            }

            if ($minDist === null || $dist < $minDist) {
                $minDist = $dist;
                $minCentroid = $centroidID;
            }
        }

        $mapping[$documentID] = $minCentroid;
    }

    return $mapping;
}
+
/**
 * Recompute each centroid as the mean of the documents assigned to it.
 *
 * If fewer than $k centroids received any document, the missing ones are
 * replaced with fresh random centroids from initialiseCentroids().
 *
 * @param array $mapping document ID => centroid ID (null entries are ignored)
 * @param array $data    document ID => (dimension => value)
 * @param int   $k       number of centroids the caller expects back
 * @return array centroid ID => (dimension => mean value)
 */
function updateCentroids($mapping, $data, $k) {
    $sums = array();
    $counts = array();

    foreach ($mapping as $documentID => $centroidID) {
        // An unassigned document belongs to no centroid and must not be used
        // as an array key or counted.
        if ($centroidID === null) {
            continue;
        }

        // Create the accumulators explicitly so nothing is added to an
        // undefined array key.
        if (!isset($counts[$centroidID])) {
            $counts[$centroidID] = 0;
            $sums[$centroidID] = array();
        }
        $counts[$centroidID]++;

        foreach ($data[$documentID] as $dim => $value) {
            if (!isset($sums[$centroidID][$dim])) {
                $sums[$centroidID][$dim] = 0;
            }
            $sums[$centroidID][$dim] += $value;
        }
    }

    $centroids = array();
    foreach ($sums as $centroidID => $dimSums) {
        $centroids[$centroidID] = array();
        foreach ($dimSums as $dim => $sum) {
            $centroids[$centroidID][$dim] = $sum / $counts[$centroidID];
        }
    }

    if (count($centroids) < $k) {
        // array_merge() renumbers integer keys, so centroid IDs may shift here.
        $centroids = array_merge($centroids, initialiseCentroids($data, $k - count($centroids)));
    }

    return $centroids;
}
+
/**
 * Divide every element of $vector by $total.
 *
 * @param array     $vector values to scale; keys are preserved
 * @param int|float $total  divisor, normally the sum of $vector
 * @return array the scaled copy, or $vector unchanged when $total is 0
 */
function normaliseValue(array $vector, $total) {
    // A zero total cannot be divided by (DivisionByZeroError on PHP 8), so the
    // vector is handed back as it came in.
    if ($total == 0) {
        return $vector;
    }

    // Write back by key rather than by reference, so no reference is left
    // bound to the last element after the loop.
    foreach ($vector as $key => $value) {
        $vector[$key] = $value / $total;
    }

    return $vector;
}
+
@@ -0,0 +1,132 @@
+
+# clustering.py contains classes and functions that cluster data points
+import sys, math, random
+
+
class Point:
    """A point in n-dimensional space.

    Attributes:
        coords: the list of coordinates handed to the constructor.
        n: the number of dimensions, i.e. ``len(coords)``.
        reference: an optional object bound to this Point.
    """

    def __init__(self, coords, reference=None):
        """Store the coordinates, their count and the optional reference."""
        self.reference = reference
        self.coords = coords
        self.n = len(self.coords)

    def __repr__(self):
        """Return the string form of the coordinate list."""
        text = str(self.coords)
        return text
+
+
class Cluster:
    """A non-empty group of Points that live in the same n-dimensional space.

    Attributes:
        points: list of Points currently associated with this Cluster.
        n: number of dimensions, taken from the first Point given to the
            constructor.
        centroid: Point holding the per-coordinate mean of ``points``.
    """

    def __init__(self, points):
        """Build a Cluster around ``points`` and compute its centroid.

        Raises:
            Exception: if ``points`` is empty, or if the Points do not all
                have the same number of dimensions.
        """
        # Empty Clusters are forbidden: their mean is undefined.
        if not points:
            raise Exception("ILLEGAL: EMPTY CLUSTER")
        self.points = points
        self.n = points[0].n
        # Clusters mixing e.g. 2D and 3D Points are forbidden as well.
        for p in points:
            if p.n != self.n:
                raise Exception("ILLEGAL: MULTISPACE CLUSTER")
        self.centroid = self.calculateCentroid()

    def __repr__(self):
        """Return the string form of the member list."""
        return str(self.points)

    def update(self, points):
        """Replace the member list and return how far the centroid moved.

        Args:
            points: the new, non-empty list of Points for this Cluster.

        Returns:
            The distance between the old and the new centroid, as computed
            by getDistance().

        Raises:
            Exception: if ``points`` is empty. The check happens before any
                state is touched, so a rejected update leaves the Cluster
                exactly as it was instead of failing halfway through with a
                ZeroDivisionError.
        """
        if not points:
            raise Exception("ILLEGAL: EMPTY CLUSTER")
        old_centroid = self.centroid
        self.points = points
        self.centroid = self.calculateCentroid()
        return getDistance(old_centroid, self.centroid)

    def calculateCentroid(self):
        """Return a Point at the average of all member Points."""
        count = len(self.points)
        centroid_coords = []
        for i in range(self.n):
            # A float accumulator keeps the division below a true division
            # even when the coordinates are integers.
            total = 0.0
            for p in self.points:
                total = total + p.coords[i]
            centroid_coords.append(total / count)
        return Point(centroid_coords)
+
+
def kmeans(points, k, cutoff):
    """Return Clusters of Points formed by K-means clustering.

    Args:
        points: list of Points to cluster.
        k: number of Clusters; the initial centroids are ``k`` Points drawn
            with random.sample().
        cutoff: iteration stops once the largest centroid shift in a pass is
            below this value.

    Returns:
        The list of ``k`` Cluster objects.
    """
    # Randomly sample k Points and build one single-Point Cluster around each.
    clusters = [Cluster([p]) for p in random.sample(points, k)]

    while True:
        # One member list per Cluster for this pass.
        lists = [[] for _ in clusters]

        for p in points:
            # Find the Cluster whose centroid is nearest; on a tie the
            # earliest Cluster wins.
            index = 0
            smallest_distance = getDistance(p, clusters[0].centroid)
            for i in range(1, len(clusters)):
                distance = getDistance(p, clusters[i].centroid)
                if distance < smallest_distance:
                    smallest_distance = distance
                    index = i
            lists[index].append(p)

        # Update each Cluster and record the biggest centroid shift.
        biggest_shift = 0.0
        for cluster, members in zip(clusters, lists):
            if not members:
                # No Point chose this Cluster in this pass (this happens, for
                # instance, when two seeds have identical coordinates). Its
                # mean would be a division by zero, so it is left untouched;
                # its centroid and its previous member list stay as they were.
                continue
            biggest_shift = max(biggest_shift, cluster.update(members))

        # Stop once no centroid moved by as much as the cutoff.
        if biggest_shift < cutoff:
            break

    return clusters
+
+
def getDistance(a, b):
    """Return the Euclidean distance between Points ``a`` and ``b``.

    Raises:
        Exception: if the two Points have a different number of dimensions.
    """
    # Points from different spaces cannot be compared.
    if a.n != b.n:
        raise Exception("ILLEGAL: NON-COMPARABLE POINTS")

    # sqrt of the sum of squared per-axis differences.
    squared = 0.0
    for axis in range(a.n):
        delta = a.coords[axis] - b.coords[axis]
        squared += pow(delta, 2)
    return math.sqrt(squared)
+
def makeRandomPoint(n, lower, upper):
    """Create a random Point in n-dimensional space.

    Each of the ``n`` coordinates is drawn with
    ``random.uniform(lower, upper)``.
    """
    return Point([random.uniform(lower, upper) for _ in range(n)])
+
def main(args):
    """Cluster a handful of random Points and print the outcome.

    Args:
        args: the command-line argument list; accepted but not used.
    """
    num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200
    # Create num_points random Points in n-dimensional space.
    points = [makeRandomPoint(n, lower, upper) for _ in range(num_points)]
    # Cluster the points using the K-means algorithm.
    clusters = kmeans(points, k, cutoff)
    # Print the results. Each print() gets a single pre-formatted string so
    # the output is the same on Python 2 and Python 3; the old print
    # statements were a SyntaxError on Python 3.
    print("\nPOINTS:")
    for p in points:
        print("P: %s" % (p,))
    print("\nCLUSTERS:")
    for c in clusters:
        print("C: %s" % (c,))


# -- The following code executes upon command-line invocation
if __name__ == "__main__":
    main(sys.argv)
+
Binary file not shown.
Oops, something went wrong.

0 comments on commit 0082ae9

Please sign in to comment.