In [1]:
//default_exp data

# Data

> Data loading and processing functions.

In [2]:
/**
Imports we need in data.module.js
*/
import {argmax} from './src/util.module.js';

In [3]:
// Imports we need for testing
import {shape} from './src/util.module.js';
import {testEq} from './src/testutil.module.js'

## Load

In [4]:
/**
Log the leading entries of an array to the console, one per line, each
prefixed with its index.

@param {Array} data - Array to preview.
@param {number} [rows=10] - Maximum number of entries to log; capped at `data.length`.
*/
function head(data,rows=10) {
    const limit=Math.min(rows,data.length);
    let index=0;
    while (index<limit) {
        console.log(index, data[index]);
        index+=1;
    }
}

/**
Log the trailing entries of an array to the console, one per line. Each entry
is prefixed with its negative offset from the end (-1 is the last entry).

@param {Array} data - Array to preview.
@param {number} [rows=10] - Maximum number of entries to log; capped at `data.length`.
*/
function tail(data,rows=10) {
    const count=Math.min(rows,data.length);
    const total=data.length;
    // walk forward over the final `count` positions and report each as an offset from the end
    for (let pos=total-count; pos<total; pos++) {
        console.log(pos-total, data[pos]);
    }
}

In [5]:
/**
Default row handler for `parseCsv`: splits each row on commas and collects the
resulting arrays of strings in `result`.
*/
class RowHandler {
    constructor() {
        // one array of string fields per handled row
        this.result=[];
    }
    /**
    Split one csv row into its fields and append them to `result`.

    @param {string} row - A single line of comma separated values.
    @param {number} i - Index of the row in the source data (not used by this handler).
    */
    handleRow(row,i) {
        const fields=row.split(',');
        this.result.push(fields);
    }
}

/**
Parse simple csv formatted strings.

The input is split into lines (both `\n` and `\r\n` line endings are accepted)
and every non-empty line is passed to `rowHandler.handleRow(row, i)`, where `i`
is the index of the line in the input.

@param {string} stringData - The csv text to parse.
@param {Object} [rowHandler] - Object with a `handleRow(row, i)` method; a new `RowHandler` is used when omitted.
@param {number} [rowLimit] - Maximum number of lines to read (empty lines count towards the limit); all lines are read when omitted.
@returns {Object} The row handler, so callers can read whatever it collected.
*/
function parseCsv(stringData, rowHandler, rowLimit) {
    if (rowHandler == null) {
        rowHandler = new RowHandler();
    }

    // Accept Windows line endings too, so rows do not carry a trailing '\r'.
    const rows=stringData.split(/\r?\n/);
    // Clamp the limit: reading past the last line would hand `undefined` to the handler.
    const limit=(rowLimit==null) ? rows.length : Math.min(rowLimit,rows.length);
    for (let i=0; i<limit; i++) {
        const row=rows[i];
        if (row !== '') {
            rowHandler.handleRow(row,i);
        }
    }
    return rowHandler;
}

In [6]:
// Read the raw iris csv from disk as a single string.
let stringData=require('fs').readFileSync('data/iris.data').toString();
// No handler is passed, so parseCsv uses RowHandler and `result` holds one array of string fields per row.
let data=parseCsv(stringData).result;
// Preview the first and the last five parsed rows.
head(data,5)
tail(data,5)

0 [ '5.1', '3.5', '1.4', '0.2', 'Iris-setosa' ]
1 [ '4.9', '3.0', '1.4', '0.2', 'Iris-setosa' ]
2 [ '4.7', '3.2', '1.3', '0.2', 'Iris-setosa' ]
3 [ '4.6', '3.1', '1.5', '0.2', 'Iris-setosa' ]
4 [ '5.0', '3.6', '1.4', '0.2', 'Iris-setosa' ]
-5 [ '6.7', '3.0', '5.2', '2.3', 'Iris-virginica' ]
-4 [ '6.3', '2.5', '5.0', '1.9', 'Iris-virginica' ]
-3 [ '6.5', '3.0', '5.2', '2.0', 'Iris-virginica' ]
-2 [ '6.2', '3.4', '5.4', '2.3', 'Iris-virginica' ]
-1 [ '5.9', '3.0', '5.1', '1.8', 'Iris-virginica' ]


We can use `IrisRowHandler` &darr; and `parseCsv` &uarr; to prepare [iris.data](https://archive.ics.uci.edu/ml/datasets/iris) for learning.

In [7]:
/**
Lookup table for the three iris classes.

- numeric keys (0, 1, 2) map a class id to its class name
- `<class name>-onehot` maps a class name to its one-hot target
- `<class name>-classid` maps a class name to its class id
*/
const IRIS_CLASS_MAP = {
    0: 'Iris-setosa',
    'Iris-setosa-onehot': [1,0,0],
    'Iris-setosa-classid': 0,
    1: 'Iris-versicolor',
    'Iris-versicolor-onehot': [0,1,0],
    'Iris-versicolor-classid': 1,
    2: 'Iris-virginica',
    'Iris-virginica-onehot': [0,0,1],
    'Iris-virginica-classid': 2
};

/**
Row handler that converts rows of the iris dataset from strings into numeric
input features and targets.

`result[0]` collects the normalized input features (4 numbers per row) and
`result[1]` collects the matching targets.
*/
class IrisRowHandler {
    /**
    @param {string} [targetType='onehot'] - Suffix used to look up the target in
        `IRIS_CLASS_MAP`: 'onehot' or 'classid'.
    */
    constructor(targetType) {
        this.targetType = (targetType==null) ? 'onehot' : targetType;
        this.result=[[],[]];
    }
    /**
    Shift and scale the four input features of one row.

    @param {Array} row - The four feature values of a row.
    @returns {number[]} Four values computed as `(value-offset)/scale`.
    */
    normalize(row) {
        // [offset, scale] per feature - presumably the mean and standard deviation
        // of each feature over the iris dataset (TODO confirm)
        const stats=[
            [5.843333333,0.828066128],
            [3.054,0.433594311],
            [3.758666667,1.76442042],
            [1.198666667,0.763160742]
        ];
        return stats.map(([offset,scale],i)=>(row[i]-offset)/scale);
    }
    /**
    Parse one csv row and append its features and target to `result`.

    An unknown class name or target type yields an `undefined` target.

    @param {string} row - e.g. '5.1,3.5,1.4,0.2,Iris-setosa'
    */
    handleRow(row) {
        const fields=row.split(',');
        // convert datatypes and normalize input features
        const features=fields.slice(0,4).map(a=>parseFloat(a));
        this.result[0].push(this.normalize(features));
        // trim so stray whitespace (e.g. a trailing '\r') does not break the lookup
        const label=String(fields[4]).trim();
        const target=IRIS_CLASS_MAP[`${label}-${this.targetType}`];
        // copy one-hot arrays so rows neither share a target object nor alias the lookup table
        this.result[1].push(Array.isArray(target) ? [...target] : target);
    }
}

In [8]:
// A single handled row should produce one input with 4 features and one one-hot target
// (no target type is passed, so the handler uses 'onehot').
let rh=new IrisRowHandler();
rh.handleRow('5.1,3.5,1.4,0.2,Iris-setosa');
testEq([1,4],shape(rh.result[0]));
testEq([[1,0,0]],rh.result[1]);

In [9]:
// Read only the first 3 lines; result[0] holds the normalized inputs and result[1] the targets.
let rowHandler=parseCsv(stringData, new IrisRowHandler(), 3);
let data=rowHandler.result;
// Only 3 rows were parsed, so head and tail log the same rows (tail labels them -3..-1).
head(data[0]) // x (inputs)
head(data[1]) // y (targets)
tail(data[0]) // x (inputs)
tail(data[1]) // y (targets)

0 [
  -0.8976738787702239,
  1.0286112817564161,
  -1.3367940204410014,
  -1.3085928193617695
]
1 [
  -1.1392004830319542,
  -0.12454037940548492,
  -1.3367940204410014,
  -1.3085928193617695
]
2 [
  -1.3807270872936854,
  0.3367202850592759,
  -1.3934698551040348,
  -1.3085928193617695
]
0 [ 1, 0, 0 ]
1 [ 1, 0, 0 ]
2 [ 1, 0, 0 ]
-3 [
  -0.8976738787702239,
  1.0286112817564161,
  -1.3367940204410014,
  -1.3085928193617695
]
-2 [
  -1.1392004830319542,
  -0.12454037940548492,
  -1.3367940204410014,
  -1.3085928193617695
]
-1 [
  -1.3807270872936854,
  0.3367202850592759,
  -1.3934698551040348,
  -1.3085928193617695
]
-3 [ 1, 0, 0 ]
-2 [ 1, 0, 0 ]
-1 [ 1, 0, 0 ]


## Shuffle

The following `shuffle` function is borrowed from https://bost.ocks.org/mike/shuffle/ - modified to shuffle multiple arrays in the same way.

In [10]:
/**
Shuffle any number of arrays in the same way.

The arrays are shuffled in place and every array receives the same sequence of
swaps, so items that share an index before the call still share an index
afterwards. The number of swaps is taken from the length of the first array.

@param {Array[]} arrays - The arrays to shuffle together.
@returns {Array[]} The same `arrays` object that was passed in.
*/
function shuffle(arrays) {
    // nothing to shuffle - and there is no first array to take a length from
    if (arrays.length === 0) {
        return arrays;
    }
    let remaining = arrays[0].length;
    // While there remain elements to shuffle…
    while (remaining > 0) {
        // Pick a remaining element…
        const pick = Math.floor(Math.random() * remaining);
        remaining -= 1;
        // And swap it with the current element (same swap in every array).
        for (const array of arrays) {
            [array[remaining], array[pick]] = [array[pick], array[remaining]];
        }
    }
    return arrays;
}

In [11]:
// Build two index-aligned arrays: b[i] is always a[i]*10.
let a=[],b=[];
for(let i=0; i<1000; i++) {
    a.push(i);
    b.push(i*10);
}
// Shuffle both together; if they were permuted in the same way, b/10 must still equal a.
shuffle([a,b])
b=b.map(x=>x/10)
testEq(a,b)

## Split

In [12]:
/**
Split any number of arrays into two parts each.

`percent` is a fraction between 0 and 1: the second part of every array gets
roughly that share of the items and the first part gets the rest. The split
position is computed from the length of the first array and used for all arrays.

@param {Array[]} arrays - The arrays to split.
@param {number} [percent=0.2] - Fraction of the items that goes into the second part.
@returns {Array[]} One `[firstPart, secondPart]` pair per input array.
*/
function split(arrays, percent=0.2) {
    // the same cut position applies to every array, so work it out once
    const cutAt=(arrays.length>0) ? Math.round(arrays[0].length*(1.0-percent)) : 0;
    return arrays.map(items => [items.slice(0,cutAt), items.slice(cutAt)]);
}

Note: In the test below, `let [[xTrain,xValid],[yTrain,yValid]]=split(data);` uses destructuring to assign `xTrain`, `xValid`, etc. Without destructuring, we would first store the result with `let splitData=split(data);` and then write:
```
let xTrain=splitData[0][0];
let xValid=splitData[0][1];
let yTrain=splitData[1][0];
let yValid=splitData[1][1];
```

In [13]:
// 9 rows with the default percent of 0.2: the split position is Math.round(9*0.8)=7,
// so we expect 7 training rows and 2 validation rows.
let data=parseCsv(stringData, new IrisRowHandler(), 9).result;
let [[xTrain,xValid],[yTrain,yValid]]=split(data);
testEq([7,4], shape(xTrain));
testEq([2,4], shape(xValid));
testEq([7,3], shape(yTrain));
testEq([2,3], shape(yValid));

Note: `[...Array(10).keys()]` is the JavaScript equivalent of `list(range(10))` in Python.

In [14]:
let data=[[...Array(10).keys()]];
// percent=.3 keeps the first 7 items in the first part and puts the last 3 in the second part
let [[xTrain,xValid]]=split(data, .3);
testEq([0,1,2,3,4,5,6], xTrain);
testEq([7,8,9], xValid);
// shuffle works in place on the array inside `data`; xTrain and xValid are slices taken before the shuffle
shuffle(data);
// not sure how to test that this got shuffled properly, so we'll just print it
data

[ [
    9, 8, 5, 2, 6,
    1, 4, 3, 7, 0
  ] ]


In [15]:
// then print the split result (default percent of 0.2: the first 8 items, then the last 2)
split(data)

[ [ [
      9, 8, 5, 2,
      6, 1, 4, 3
    ], [ 7, 0 ] ] ]


## Batch

In [16]:
/**
Shuffle any number of arrays then put them into an array of batches.

Each batch is an array holding one slice per input array, e.g. `[xBatch, yBatch]`
for `arrays=[x, y]`. The number of batches is derived from the length of the
first array. Note that shuffling happens in place, so the arrays passed in are
reordered unless `shuffleArrays` is false.

@param {Array[]} arrays - The arrays to batch together.
@param {number} [bs=64] - Batch size; must be greater than zero.
@param {boolean} [dropLast=false] - Drop the final batch when it has fewer than `bs` items.
@param {boolean} [shuffleArrays=true] - Shuffle the arrays (in place) before batching.
@returns {Array[]} The batches.
@throws {RangeError} When `bs` is not a positive number.
*/
function batches(arrays, bs=64, dropLast=false, shuffleArrays=true) {
    // a batch size of zero would make the batch count infinite and the loop below never end
    if (!(bs > 0)) {
        throw new RangeError(`bs must be a positive number, got ${bs}`);
    }
    // no arrays means no batches (and no first array to take a length from)
    if (arrays.length === 0) {
        return [];
    }
    if (shuffleArrays) {
        shuffle(arrays);
    }
    const total=arrays[0].length;
    // keep the partial final batch unless the caller asked to drop it
    const batchCount=dropLast ? Math.floor(total/bs) : Math.ceil(total/bs);
    const result=[];
    for (let i=0; i<batchCount; i++) {
        result.push(arrays.map(array=>array.slice(bs*i,bs*(i+1))));
    }
    return result;
}

In [17]:
let data=parseCsv(stringData, new IrisRowHandler(), 10).result;
// 10 rows in batches of 3: three full batches plus a final batch holding the remaining row
let batchesOfData=batches(data,3)
testEq(4, batchesOfData.length);
testEq(2, batchesOfData[0].length); // every batch holds one slice per input array: [x, y]
testEq([3,4], shape(batchesOfData[0][0])); // 1st batch has 3 items
testEq([3,3], shape(batchesOfData[0][1]));
testEq(2, batchesOfData[3].length);
testEq([1,4], shape(batchesOfData[3][0])); // last batch has 1 item
testEq([1,3], shape(batchesOfData[3][1]));
// with dropLast=true the incomplete final batch is left out
batchesOfData=batches(data,3,true)
testEq(3, batchesOfData.length);
testEq(2, batchesOfData[0].length);
testEq([3,4], shape(batchesOfData[2][0])); // last batch has 3 items
testEq([3,3], shape(batchesOfData[2][1]));

In [18]:
// test the no-shuffle option: the batch size (64) exceeds the data length, so
// everything lands in one batch and the original order is kept
let data=[[0,1,2],[3,4,5]];
testEq([[[0,1,2],[3,4,5]]],batches(data,64,false,false));

In [19]:
export {head,tail,parseCsv,IRIS_CLASS_MAP,RowHandler,IrisRowHandler,shuffle,split,batches}