Skip to content

Commit

Permalink
Initial commit; Only supports simple linear regression for now
Browse files Browse the repository at this point in the history
  • Loading branch information
luccastera committed Oct 19, 2014
0 parents commit 3915a07
Show file tree
Hide file tree
Showing 9 changed files with 558 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.DS_Store
node_modules
4 changes: 4 additions & 0 deletions History.md
@@ -0,0 +1,4 @@
0.0.1 / 2014-10-19
==================

* Initial release
25 changes: 25 additions & 0 deletions README.md
@@ -0,0 +1,25 @@
# shaman

Machine Learning library for node.js

## Linear Regression

Linear Regression is implemented using linear algebra and the normal equation.

### Usage

```javascript
var X = [1, 2, 3, 4, 5];
var Y = [2, 2, 3, 3, 5];
var lr = new LinearRegression(X,Y);
lr.train(function(err) {
if (err) { throw err; }

// you can now start using lr.predict:
console.log(lr.predict(1));
});
```

### Example

[Click here](https://plot.ly/~luccastera/2) to see an example, built with the shaman library,
that uses Simple Linear Regression to estimate the price of cars from their horsepower.
205 changes: 205 additions & 0 deletions examples/cars.data

Large diffs are not rendered by default.

79 changes: 79 additions & 0 deletions examples/cars.js
@@ -0,0 +1,79 @@
var csvParse = require('csv-parse'),
fs = require('fs'),
LinearRegression = require('../index').LinearRegression,
_ = require('underscore');

// Example: fit car price against horsepower with LinearRegression and plot
// the training data plus the fitted line on plot.ly.
//
// Plotly credentials are read from the environment.
var apiKey = process.env.PLOTLY_API_KEY;
var username = process.env.PLOTLY_USERNAME;
var plotly = require('plotly')(username, apiKey);

// Zero-based column indexes into each parsed row of cars.data
// (attributes 22 "horsepower" and 26 "price" in examples/cars.names).
var HORSEPOWER_COLUMN = 21;
var PRICE_COLUMN = 25;

fs.readFile('./examples/cars.data', 'utf8', function(err, dataStr) {
  if (err) {
    console.log(err);
    process.exit(1);
  }
  csvParse(dataStr, {delimiter: ',', auto_parse: true}, function(err, data) {
    // A parse failure would otherwise surface later as an unrelated error on `data`.
    if (err) {
      console.log(err);
      process.exit(1);
    }

    // First, clean up the training data by eliminating rows that have invalid
    // values. The check must look at the columns that are actually used below:
    // missing values are denoted by "?" and therefore are not parsed as numbers.
    var cleanData = _.filter(data, function(d) {
      return typeof d[HORSEPOWER_COLUMN] === 'number' && typeof d[PRICE_COLUMN] === 'number';
    });

    // We are only going to use two columns:
    // x: horsepower
    // y: price of car
    var x = cleanData.map(function(h) { return h[HORSEPOWER_COLUMN]; });
    var y = cleanData.map(function(h) { return h[PRICE_COLUMN]; });

    // Initialize and train the linear regression
    var lr = new LinearRegression(x, y);
    lr.train(function(err) {
      if (err) {
        console.log('error', err);
        process.exit(2);
      }

      // Evaluate the fitted function at every training x to get the points
      // used to draw the regression line.
      var y2 = x.map(function(xi) {
        return lr.predict(xi);
      });

      // Create scatter plots of training data + linear regression function
      var layout = {
        title: 'Car Prices vs Horsepower',
        xaxis: {
          title: 'Horsepower'
        },
        yaxis: {
          title: 'Price in $'
        }
      };
      var trace1 = {
        x: x,
        y: y,
        name: 'Training Data',
        mode: "markers",
        type: "scatter"
      };
      var trace2 = {
        x: x,
        y: y2,
        name: 'Linear Regression',
        mode: "lines",
        type: "scatter"
      };
      var plotData = [trace1, trace2];
      var graphOptions = {layout: layout, filename: "cars-linear-regression-with-shaman", fileopt: "overwrite"};
      plotly.plot(plotData, graphOptions, function (err, msg) {
        if (err) {
          console.log(err);
          process.exit(3);
        } else {
          console.log('Success! The plot (' + msg.filename + ') can be found at ' + msg.url);
          process.exit();
        }
      });
    });
  });
});

103 changes: 103 additions & 0 deletions examples/cars.names
@@ -0,0 +1,103 @@
1. Title: 1985 Auto Imports Database

2. Source Information:
-- Creator/Donor: Jeffrey C. Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
-- Date: 19 May 1987
-- Sources:
1) 1985 Model Import Car and Truck Specifications, 1985 Ward's
Automotive Yearbook.
2) Personal Auto Manuals, Insurance Services Office, 160 Water
Street, New York, NY 10038
3) Insurance Collision Report, Insurance Institute for Highway
Safety, Watergate 600, Washington, DC 20037

3. Past Usage:
-- Kibler,~D., Aha,~D.~W., \& Albert,~M. (1989). Instance-based prediction
of real-valued attributes. {\it Computational Intelligence}, {\it 5},
51--57.
-- Predicted price of car using all numeric and Boolean attributes
-- Method: an instance-based learning (IBL) algorithm derived from a
localized k-nearest neighbor algorithm. Compared with a
linear regression prediction...so all instances
with missing attribute values were discarded. This resulted with
a training set of 159 instances, which was also used as a test
set (minus the actual instance during testing).
-- Results: Percent Average Deviation Error of Prediction from Actual
-- 11.84% for the IBL algorithm
-- 14.12% for the resulting linear regression equation

4. Relevant Information:
-- Description
This data set consists of three types of entities: (a) the
specification of an auto in terms of various characteristics, (b)
its assigned insurance risk rating, (c) its normalized losses in use
as compared to other cars. The second rating corresponds to the
degree to which the auto is more risky than its price indicates.
Cars are initially assigned a risk factor symbol associated with its
price. Then, if it is more risky (or less), this symbol is
adjusted by moving it up (or down) the scale. Actuarians call this
process "symboling". A value of +3 indicates that the auto is
risky, -3 that it is probably pretty safe.

The third factor is the relative average loss payment per insured
vehicle year. This value is normalized for all autos within a
particular size classification (two-door small, station wagons,
sports/speciality, etc...), and represents the average loss per car
per year.

-- Note: Several of the attributes in the database could be used as a
"class" attribute.

5. Number of Instances: 205

6. Number of Attributes: 26 total
-- 15 continuous
-- 1 integer
-- 10 nominal

7. Attribute Information:
Attribute: Attribute Range:
------------------ -----------------------------------------------
1. symboling: -3, -2, -1, 0, 1, 2, 3.
2. normalized-losses: continuous from 65 to 256.
3. make: alfa-romero, audi, bmw, chevrolet, dodge, honda,
isuzu, jaguar, mazda, mercedes-benz, mercury,
mitsubishi, nissan, peugot, plymouth, porsche,
renault, saab, subaru, toyota, volkswagen, volvo
4. fuel-type: diesel, gas.
5. aspiration: std, turbo.
6. num-of-doors: four, two.
7. body-style: hardtop, wagon, sedan, hatchback, convertible.
8. drive-wheels: 4wd, fwd, rwd.
9. engine-location: front, rear.
10. wheel-base: continuous from 86.6 to 120.9.
11. length: continuous from 141.1 to 208.1.
12. width: continuous from 60.3 to 72.3.
13. height: continuous from 47.8 to 59.8.
14. curb-weight: continuous from 1488 to 4066.
15. engine-type: dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
16. num-of-cylinders: eight, five, four, six, three, twelve, two.
17. engine-size: continuous from 61 to 326.
18. fuel-system: 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
19. bore: continuous from 2.54 to 3.94.
20. stroke: continuous from 2.07 to 4.17.
21. compression-ratio: continuous from 7 to 23.
22. horsepower: continuous from 48 to 288.
23. peak-rpm: continuous from 4150 to 6600.
24. city-mpg: continuous from 13 to 49.
25. highway-mpg: continuous from 16 to 54.
26. price: continuous from 5118 to 45400.

8. Missing Attribute Values: (denoted by "?")
Attribute #: Number of instances missing a value:
2. 41
6. 2
19. 4
20. 4
22. 2
23. 2
26. 4




47 changes: 47 additions & 0 deletions index.js
@@ -0,0 +1,47 @@
/*globals require */

var sylvester = require('sylvester'),
Matrix = sylvester.Matrix,
Vector = sylvester.Vector;

/**
 * Simple linear regression (one input value per sample) fitted with the
 * normal equation.
 *
 * @constructor
 * @param {Array} [X] - training inputs; `train` turns each element xi into a row [1, xi].
 * @param {Array} [Y] - training targets, one per element of X.
 * @param {Object} [options] - stored on `this.options`; not read by any method in this block.
 * @throws {Error} if X or Y is provided (not undefined/null) and is not an array.
 */
var LinearRegression = function(X, Y, options) {
  // Only undefined/null mean "not provided". Other falsy values such as 0 or ''
  // are rejected instead of being silently replaced by an empty array.
  if (X !== undefined && X !== null && !Array.isArray(X)) {
    throw new Error('X must be an array');
  }
  if (Y !== undefined && Y !== null && !Array.isArray(Y)) {
    throw new Error('Y must be an array');
  }

  this.X = X || [];
  this.Y = Y || [];
  this.options = options || {};
};

/**
 * Fits the model and stores the coefficients in `this.theta`
 * (a 2x1 matrix: intercept, then slope).
 *
 * The callback is invoked synchronously, with an Error as its first argument
 * when the model cannot be fitted and with no arguments on success.
 *
 * @param {Function} callback - function(err)
 */
LinearRegression.prototype.train = function(callback) {
  if (this.X.length === 0) {
    return callback(new Error('X is empty'));
  }
  if (this.Y.length === 0) {
    return callback(new Error('Y is empty'));
  }
  // With mismatched lengths the final matrix product is not defined, which
  // would leave theta unusable while still reporting success.
  if (this.X.length !== this.Y.length) {
    return callback(new Error('X and Y must have the same length'));
  }

  // normal equation using sylvester: theta = (x' x)^-1 x' y
  var designRows = this.X.map(function(xi) {
    return [1, xi];
  });
  var x = $M(designRows);
  var y = $M(this.Y);
  var xT = x.transpose();

  // inverse() yields null when x' x is singular (for example a single sample,
  // or all inputs identical); report that instead of dereferencing null.
  var gramInverse = xT.x(x).inverse();
  if (!gramInverse) {
    return callback(new Error('Unable to train: X does not contain enough distinct values'));
  }

  this.theta = gramInverse.x(xT).x(y);
  return callback();
};

/**
 * Evaluates the fitted line at `input`.
 *
 * @param {number} input - value to predict for.
 * @returns {number} the single element of theta' * [1, input].
 * @throws {Error} if the model has not been trained successfully.
 */
LinearRegression.prototype.predict = function(input) {
  if (!this.theta) {
    throw new Error('Model has not been trained; call train() before predict()');
  }
  var xInput = $M([1, input]);
  var output = this.theta.transpose().x(xInput);
  return output.e(1, 1);
};

exports.LinearRegression = LinearRegression;
27 changes: 27 additions & 0 deletions package.json
@@ -0,0 +1,27 @@
{
"name": "shaman",
"description": "Machine Learning library for node.js",
"keywords": ["machine learning", "linear regression", "statistics", "gradient descent algorithm"],
"version": "0.0.1",
"author": "Luc Castera <luc.castera@gmail.com>",
"license": "MIT",
"repository": {
"type": "git",
"url": "http://github.com/dambalah/shaman.git"
},
"main": "index.js",
"dependencies": {
"sylvester": "0.0.21"
},
"devDependencies": {
"mocha": "1.21.5",
"jshint": "2.5.6",
"csv-parse": "0.0.6",
"plotly": "0.2.13",
"underscore": "1.7.0"
},
"scripts": {
"test": "./node_modules/mocha/bin/mocha -R spec spec.js",
"jshint": "./node_modules/jshint/bin/jshint index.js spec.js"
}
}
66 changes: 66 additions & 0 deletions spec.js
@@ -0,0 +1,66 @@
/*globals describe, require */

var LinearRegression = require('./index').LinearRegression,
assert = require('assert');

// Mocha spec for LinearRegression: construction, train() error reporting and
// predict() accuracy on a small hand-checkable data set.
describe('LinearRegresssion', function() {
  // Two-sided tolerance check. A one-sided `actual - expected < tolerance`
  // would also accept predictions that are arbitrarily too small.
  var assertClose = function(actual, expected, tolerance) {
    assert.ok(
      Math.abs(actual - expected) < tolerance,
      'expected ' + actual + ' to be within ' + tolerance + ' of ' + expected
    );
  };

  describe('initialization', function() {
    it('can be initialized with no parameters', function(done) {
      var lr = new LinearRegression();
      assert.ok(lr);
      done();
    });

    it('should throw an error if X is not an array', function(done) {
      var x = 'a string';
      assert.throws(function() {
        var lr = new LinearRegression(x);
      }, Error);
      done();
    });

    it('should throw an error if Y is not an array', function(done) {
      var x = [1];
      var y = 'a string';
      assert.throws(function() {
        var lr = new LinearRegression(x, y);
      }, Error);
      done();
    });
  });

  describe('train', function() {
    it('should throw an error if there is no data in X', function(done) {
      var lr = new LinearRegression();
      lr.train(function(err) {
        assert.ok(err);
        assert.equal(err.message, 'X is empty');
        done();
      });
    });
    it('should throw an error if there is no data in Y', function(done) {
      var lr = new LinearRegression([0,1,2,3]);
      lr.train(function(err) {
        assert.ok(err);
        assert.equal(err.message, 'Y is empty');
        done();
      });
    });
  });

  describe('predict', function() {
    it('should predict a simple example correctly', function(done) {
      var lr = new LinearRegression([1, 2, 3, 4, 5], [2, 2, 3, 3, 5]);
      lr.train(function(err) {
        // Fail fast with the real cause if training itself failed.
        assert.ifError(err);
        assertClose(lr.predict(0), 0.899, 0.01);
        assertClose(lr.predict(1), 1.599, 0.01);
        assertClose(lr.predict(2), 2.3, 0.01);
        assertClose(lr.predict(3), 2.999, 0.01);
        assertClose(lr.predict(4), 3.699, 0.01);
        assertClose(lr.predict(5), 4.4, 0.01);
        assertClose(lr.predict(10), 7.9, 0.01);
        done();
      });
    });
  });
});

0 comments on commit 3915a07

Please sign in to comment.