Initial commit; Only supports simple linear regression for now

luccastera · Oct 19, 2014 · 3915a07 · 3915a07
commit 3915a07
Show file tree

Hide file tree

Showing 9 changed files with 558 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.DS_Store
+node_modules
diff --git a/History.md b/History.md
@@ -0,0 +1,4 @@
+0.0.1 / 2014-10-19
+==================
+
+  * Initial release
diff --git a/README.md b/README.md
@@ -0,0 +1,25 @@
+# shaman
+
+Machine Learning library for node.js
+
+## Linear Regression
+
+Linear Regression is implemented using linear algebra and the normal equation.
+
+### Usage
+
+```javascript
+var LinearRegression = require('shaman').LinearRegression;
+
+var X = [1, 2, 3, 4, 5];
+var Y = [2, 2, 3, 3, 5];
+var lr = new LinearRegression(X,Y);
+lr.train(function(err) {
+  if (err) { throw err; }
+
+  // you can now start using lr.predict:
+  console.log(lr.predict(1));
+});
+```
+
+### Example
+
+[Click here](https://plot.ly/~luccastera/2) to see an example of simple linear regression, built with
+the shaman library, that estimates the price of cars from their horsepower.
diff --git a/examples/cars.data b/examples/cars.data
diff --git a/examples/cars.js b/examples/cars.js
@@ -0,0 +1,79 @@
+var csvParse = require('csv-parse'),
+    fs = require('fs'),
+    LinearRegression = require('../index').LinearRegression,
+    _ = require('underscore');
+
+var apiKey = process.env.PLOTLY_API_KEY;
+var username = process.env.PLOTLY_USERNAME;
+var plotly = require('plotly')(username,apiKey);
+
+fs.readFile('./examples/cars.data', 'utf8', function(err, dataStr) {
+  if (err) {
+    console.log(err);
+    process.exit(1);
+  }
+  csvParse(dataStr, {delimiter: ',', auto_parse: true}, function(err, data) {
+    // First, clean up the training data by eliminating rows that have invalid values
+    var cleanData = _.filter(data, function(d) { return typeof d[0] === 'number' && typeof d[1] === 'number'; });
+
+    // We are only going two columns:
+    //     x: horsepower
+    //     y: price of car
+    var xAndY = cleanData.map(function(h) { return [h[21], h[25]]; });
+    var x = cleanData.map(function(h) { return h[21]; }); // x is horsepower
+    var y = cleanData.map(function(h) { return h[25]; }); // y is price
+
+
+    // Initialize and train the linear regression
+    var lr = new LinearRegression(x, y);
+    lr.train(function(err) {
+      if (err) {
+        console.log('error', err);
+        process.exit(2);
+      }
+
+      // Use the linear regression function to get a set of data to graph the linear regression line
+      var y2 = [];
+      x.forEach(function(xi) {
+        y2.push(lr.predict(xi));
+      });
+
+      // Create scatter plots of training data + linear regression function
+      var layout = {
+        title: 'Car Prices vs Horsepower',
+        xaxis: {
+          title: 'Horsepower'
+        },
+        yaxis: {
+          title: 'Price in $'
+        }
+      };
+      var trace1 = {
+        x: x,
+        y: y,
+        name: 'Training Data',
+        mode: "markers",
+        type: "scatter"
+      };
+      var trace2 = {
+        x: x,
+        y: y2,
+        name: 'Linear Regression',
+        mode: "lines",
+        type: "scatter"
+      };
+      var plotData = [trace1, trace2];
+      var graphOptions = {layout: layout,filename: "cars-linear-regression-with-shaman", fileopt: "overwrite"}
+      plotly.plot(plotData, graphOptions, function (err, msg) {
+        if (err) {
+          console.log(err);
+          process.exit(3);
+        } else {
+          console.log('Success! The plot (' + msg.filename + ') can be found at ' + msg.url);
+          process.exit();
+        }
+      });
+    });
+  });
+});
+
diff --git a/examples/cars.names b/examples/cars.names
@@ -0,0 +1,103 @@
+1. Title: 1985 Auto Imports Database
+
+2. Source Information:
+   -- Creator/Donor: Jeffrey C. Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
+   -- Date: 19 May 1987
+   -- Sources:
+     1) 1985 Model Import Car and Truck Specifications, 1985 Ward's
+        Automotive Yearbook.
+     2) Personal Auto Manuals, Insurance Services Office, 160 Water
+        Street, New York, NY 10038 
+     3) Insurance Collision Report, Insurance Institute for Highway
+        Safety, Watergate 600, Washington, DC 20037
+
+3. Past Usage:
+   -- Kibler,~D., Aha,~D.~W., \& Albert,~M. (1989).  Instance-based prediction
+      of real-valued attributes.  {\it Computational Intelligence}, {\it 5},
+      51--57.
+	 -- Predicted price of car using all numeric and Boolean attributes
+	 -- Method: an instance-based learning (IBL) algorithm derived from a
+	    localized k-nearest neighbor algorithm.  Compared with a
+	    linear regression prediction...so all instances
+	    with missing attribute values were discarded.  This resulted with
+	    a training set of 159 instances, which was also used as a test
+	    set (minus the actual instance during testing).
+	 -- Results: Percent Average Deviation Error of Prediction from Actual
+	    -- 11.84% for the IBL algorithm
+	    -- 14.12% for the resulting linear regression equation
+
+4. Relevant Information:
+   -- Description
+      This data set consists of three types of entities: (a) the
+      specification of an auto in terms of various characteristics, (b)
+      its assigned insurance risk rating, (c) its normalized losses in use
+      as compared to other cars.  The second rating corresponds to the
+      degree to which the auto is more risky than its price indicates.
+      Cars are initially assigned a risk factor symbol associated with its
+      price.   Then, if it is more risky (or less), this symbol is
+      adjusted by moving it up (or down) the scale.  Actuarians call this
+      process "symboling".  A value of +3 indicates that the auto is
+      risky, -3 that it is probably pretty safe.
+
+      The third factor is the relative average loss payment per insured
+      vehicle year.  This value is normalized for all autos within a
+      particular size classification (two-door small, station wagons,
+      sports/speciality, etc...), and represents the average loss per car
+      per year.
+
+   -- Note: Several of the attributes in the database could be used as a
+            "class" attribute.
+
+5. Number of Instances: 205
+
+6. Number of Attributes: 26 total
+   -- 15 continuous
+   -- 1 integer
+   -- 10 nominal
+
+7. Attribute Information:     
+     Attribute:                Attribute Range:
+     ------------------        -----------------------------------------------
+  1. symboling:                -3, -2, -1, 0, 1, 2, 3.
+  2. normalized-losses:        continuous from 65 to 256.
+  3. make:                     alfa-romero, audi, bmw, chevrolet, dodge, honda,
+                               isuzu, jaguar, mazda, mercedes-benz, mercury,
+                               mitsubishi, nissan, peugot, plymouth, porsche,
+                               renault, saab, subaru, toyota, volkswagen, volvo
+  4. fuel-type:                diesel, gas.
+  5. aspiration:               std, turbo.
+  6. num-of-doors:             four, two.
+  7. body-style:               hardtop, wagon, sedan, hatchback, convertible.
+  8. drive-wheels:             4wd, fwd, rwd.
+  9. engine-location:          front, rear.
+ 10. wheel-base:               continuous from 86.6 to 120.9.
+ 11. length:                   continuous from 141.1 to 208.1.
+ 12. width:                    continuous from 60.3 to 72.3.
+ 13. height:                   continuous from 47.8 to 59.8.
+ 14. curb-weight:              continuous from 1488 to 4066.
+ 15. engine-type:              dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
+ 16. num-of-cylinders:         eight, five, four, six, three, twelve, two.
+ 17. engine-size:              continuous from 61 to 326.
+ 18. fuel-system:              1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
+ 19. bore:                     continuous from 2.54 to 3.94.
+ 20. stroke:                   continuous from 2.07 to 4.17.
+ 21. compression-ratio:        continuous from 7 to 23.
+ 22. horsepower:               continuous from 48 to 288.
+ 23. peak-rpm:                 continuous from 4150 to 6600.
+ 24. city-mpg:                 continuous from 13 to 49.
+ 25. highway-mpg:              continuous from 16 to 54.
+ 26. price:                    continuous from 5118 to 45400.
+
+8. Missing Attribute Values: (denoted by "?")
+   Attribute #:   Number of instances missing a value:
+   2.             41
+   6.             2
+   19.            4
+   20.            4
+   22.            2
+   23.            2
+   26.            4
+
+
+
+
diff --git a/index.js b/index.js
@@ -0,0 +1,47 @@
+/*globals require */
+
+var sylvester = require('sylvester'),
+    Matrix = sylvester.Matrix,
+    Vector = sylvester.Vector;
+
+var LinearRegression = function(X, Y, options) {
+  this.X = X || [];
+  this.Y = Y || [];
+  this.options = options || {};
+
+  // verify that X is an array
+  if (X && !Array.isArray(X)) {
+    throw new Error('X must be an array');
+  }
+
+  // verify that Y is an array
+  if (Y && !Array.isArray(Y)) {
+    throw new Error('Y must be an array');
+  }
+};
+
+LinearRegression.prototype.train = function(callback) {
+  if (this.X.length === 0) {
+    return callback(new Error('X is empty'));
+  } else if (this.Y.length === 0) {
+    return callback(new Error('Y is empty'));
+  }
+
+  // normal equation using sylvester
+  var xWithOnes = [];
+  this.X.forEach(function(xi) {
+    xWithOnes.push([1, xi]);
+  });
+  var x = $M(xWithOnes);
+  var y = $M(this.Y);
+  this.theta = x.transpose().x(x).inverse().x(x.transpose()).x(y);
+  return callback();
+};
+
+LinearRegression.prototype.predict = function(input) {
+  var xInput = $M([1, input]);
+  var output = this.theta.transpose().x(xInput);
+  return output.e(1,1);
+};
+
+exports.LinearRegression = LinearRegression;
diff --git a/package.json b/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "shaman",
+  "description": "Machine Learning library for node.js",
+  "keywords": ["machine learning", "linear regression", "statistics", "gradient descent algorithm"],
+  "version": "0.0.1",
+  "author": "Luc Castera <luc.castera@gmail.com>",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "http://github.com/dambalah/shaman.git"
+  },
+  "main": "index.js",
+  "dependencies": {
+    "sylvester": "0.0.21"
+  },
+  "devDependencies": {
+    "mocha": "1.21.5",
+    "jshint": "2.5.6",
+    "csv-parse": "0.0.6",
+    "plotly": "0.2.13",
+    "underscore": "1.7.0"
+  },
+  "scripts": {
+    "test": "./node_modules/mocha/bin/mocha -R spec spec.js",
+    "jshint": "./node_modules/jshint/bin/jshint index.js spec.js"
+  }
+}
diff --git a/spec.js b/spec.js
@@ -0,0 +1,66 @@
+/*globals describe, require */
+
+var LinearRegression = require('./index').LinearRegression,
+    assert = require('assert');
+
+describe('LinearRegresssion', function() {
+  describe('initialization', function() {
+    it('can be initialized with no parameters', function(done) {
+      var lr = new LinearRegression();
+      assert.ok(lr);
+      done();
+    });
+
+    it('should throw an error if X is not an array', function(done) {
+      var x = 'a string';
+      assert.throws(function() {
+        var lr = new LinearRegression(x);
+      }, Error);
+      done();
+    });
+
+    it('should throw an error if Y is not an array', function(done) {
+      var x = [1];
+      var y = 'a string';
+      assert.throws(function() {
+        var lr = new LinearRegression(x, y);
+      }, Error);
+      done();
+    });
+  });
+
+  describe('train', function() {
+    it('should throw an error if there is no data in X', function(done) {
+      var lr = new LinearRegression();
+      lr.train(function(err) {
+        assert.ok(err);
+        assert.equal(err.message, 'X is empty');
+        done();
+      });
+    });
+    it('should throw an error if there is no data in Y', function(done) {
+      var lr = new LinearRegression([0,1,2,3]);
+      lr.train(function(err) {
+        assert.ok(err);
+        assert.equal(err.message, 'Y is empty');
+        done();
+      });
+    });
+  });
+
+  describe('predict', function() {
+    it('should predict a simple example correctly', function(done) {
+      var lr = new LinearRegression([1, 2, 3, 4, 5], [2, 2, 3, 3, 5]);
+      lr.train(function(err) {
+        assert.ok(lr.predict(0) - 0.899 < 0.01);
+        assert.ok(lr.predict(1) - 1.599 < 0.01);
+        assert.ok(lr.predict(2) - 2.3 < 0.01);
+        assert.ok(lr.predict(3) - 2.999 < 0.01);
+        assert.ok(lr.predict(4) - 3.699 < 0.01);
+        assert.ok(lr.predict(5) - 4.4 < 0.01);
+        assert.ok(lr.predict(10) - 7.9 < 0.01);
+        done();
+      });
+    });
+  });
+});