Skip to content

Commit

Permalink
Added Check on Kanji/Hiragana... conversion. Added containsClasses op…
Browse files Browse the repository at this point in the history
…tion.
  • Loading branch information
maugenst committed Jun 7, 2018
1 parent d96f48c commit e55581b
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 4 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,9 @@ are given than existing values the overcounting values are ignored.
```

### containsClasses
Array of class names used to select specific tables: only tables carrying all of the given classes are converted. Default is 'null/undefined'.

## Known issues and limitations

This module only supports parsing basic tables with a simple horizontal set of <th></th> headings and
Expand Down Expand Up @@ -464,7 +467,7 @@ tabletojson.convertUrl(url)

# Issues

Right now the table needs to be "well formatted" to be convertable. Tables in Html pages with not be
Right now the table needs to be "well formatted" to be convertible. Tables in tables will not be
processed.

```html
Expand Down
14 changes: 12 additions & 2 deletions lib/tabletojson.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class tabletojson {
* @param options.onlyColumns <Array> Array of column indices to be used. Overrides ignoreColumn [default=null]
* @param options.ignoreHiddenRows Ignoring hidden rows [default=true]
* @param options.headings <Array> Array of Strings to be used as headings [default=null]
* @param options.containsClasses <Array> Array of classes to find a specific table [default=null]
* @return {Object} Converted Object as an object literal
*/
static convert(html, options) {
Expand All @@ -32,7 +33,8 @@ class tabletojson {
ignoreColumns: null,
onlyColumns: null,
ignoreHiddenRows: true,
headings: null
headings: null,
containsClasses: null
},
options
);
Expand All @@ -50,7 +52,9 @@ class tabletojson {

const $ = cheerio.load(html);

$('table').each(function(i, table) {
const additionalSelectors = options.containsClasses ? `.${options.containsClasses.join('.')}` : '';

$(`table${additionalSelectors}`).each(function(i, table) {
const tableAsJson = [];
const alreadySeen = {};
// Get column headings
Expand Down Expand Up @@ -149,6 +153,12 @@ class tabletojson {
* @param arg1.stripHtmlFromCells Strip HTML from cells [default=true]
* @param arg1.stripHtml Strip off HTML [default=null] if set true stripHtmlFromHeadings and stripHtmlFromCells will also be true
* @param arg1.forceIndexAsNumber Force the index to be used as number [default=false]
* @param arg1.countDuplicateHeadings If given a _<NUMBER> will be added to the duplicate key [default=false]
* @param arg1.ignoreColumns <Array> Array of column indices to ignored [default=null]
* @param arg1.onlyColumns <Array> Array of column indices to be used. Overrides ignoreColumn [default=null]
* @param arg1.ignoreHiddenRows Ignoring hidden rows [default=true]
* @param arg1.headings <Array> Array of Strings to be used as headings [default=null]
* @param arg1.containsClasses <Array> Array of classes to find a specific table [default=null]
* @param arg1.request Options to be passed to request object
* @param arg2 Callback function to be called when the conversion finished
* @return {Promise<*>} Promise containing the result
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
},
"name": "tabletojson",
"description": "Converts HTML tables to JSON objects",
"version": "0.9.3",
"version": "0.9.4",
"main": "./lib/tabletojson.js",
"keywords": [
"table2json",
Expand Down
41 changes: 41 additions & 0 deletions test/tables.html
Original file line number Diff line number Diff line change
Expand Up @@ -218,5 +218,46 @@ <h2>Table #6: Table for conversion using options 'ignoreColumns' and 'onlyColumn
</tbody>
</table>

<h2>Table #7: Table using Kanji / Hiragana</h2>
<table>
<tbody>
<tr>
<th>Kanji</th>
<th>Hiragana</th>
<th>Katakana</th>
<th>Rōmaji</th>
<th>English</th>
</tr>
<tr>
<td><span lang="ja-Hani" title="Japanese language text">私</span></td>
<td><span lang="ja-Hira" title="Japanese language text">わたし</span></td>
<td><span lang="ja-Kana" title="Japanese language text">ワタシ</span></td>
<td><i>watashi</i></td>
<td>I, me</td>
</tr>
<tr>
<td><span lang="ja-Hani" title="Japanese language text">金魚</span></td>
<td><span lang="ja-Hira" title="Japanese language text">きんぎょ</span></td>
<td><span lang="ja-Kana" title="Japanese language text">キンギョ</span></td>
<td><i>kingyo</i></td>
<td>goldfish</td>
</tr>
<tr>
<td><span lang="ja-Hani" title="Japanese language text">煙草</span> or <span lang="ja-Hani" title="Japanese language text"></span></td>
<td><span lang="ja-Hira" title="Japanese language text">たばこ</span></td>
<td><span lang="ja-Kana" title="Japanese language text">タバコ</span></td>
<td><i>tabako</i></td>
<td>tobacco, cigarette</td>
</tr>
<tr>
<td><span lang="ja-Hani" title="Japanese language text">東京</span></td>
<td><span lang="ja-Hira" title="Japanese language text">とうきょう</span></td>
<td><span lang="ja-Kana" title="Japanese language text">トウキョウ</span></td>
<td><i>tōkyō</i></td>
<td><a href="/wiki/Tokyo" title="Tokyo">Tokyo</a>, literally meaning "eastern capital"</td>
</tr>
</tbody>
</table>

</body>
</html>
20 changes: 20 additions & 0 deletions test/test-tabletojson-local.js
Original file line number Diff line number Diff line change
Expand Up @@ -479,4 +479,24 @@ describe('TableToJSON Local', function() {

table[2].A.should.equal('Bill');
});

// FIX/TEST: https://github.com/maugenst/tabletojson/issues/19
it('Test to check conversion and handling of Kanji, Hiragana, Katakana and latin texts', async function() {
    // Convert the local fixture; the Japanese writing table is read from index 6 of the result.
    const allTables = await tabletojson.convert(html);
    allTables.should.be.ok();

    const firstRow = allTables[6][0];

    // Expected cell text of the first data row, keyed by column heading.
    const expectedCells = {
        'Kanji': '私',
        'Hiragana': 'わたし',
        'Katakana': 'ワタシ',
        'Rōmaji': 'watashi',
        'English': 'I, me'
    };
    const headings = Object.keys(expectedCells);

    // Every heading must be present as a key before any cell value is compared.
    headings.forEach(function(heading) {
        _.has(firstRow, heading).should.be.true();
    });

    headings.forEach(function(heading) {
        firstRow[heading].should.equal(expectedCells[heading]);
    });
});
});
26 changes: 26 additions & 0 deletions test/test-tabletojson-remote.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require('should');
const config = require('config');
const _ = require('lodash');
const tabletojson = require('../lib/tabletojson');

describe('TableToJSON Remote', function() {
Expand Down Expand Up @@ -63,6 +64,31 @@ describe('TableToJSON Remote', function() {
});
});

it('Get table from wikipedia containing Kanji, Hiragana, Katakana and latin texts', async function() {
    // Only tables carrying the 'wikitable' class are requested; the proxy comes from the test config.
    const conversionOptions = {
        containsClasses: ['wikitable'],
        request: {
            proxy: config.get('request.proxy')
        }
    };
    const converted = await tabletojson.convertUrl(
        'https://en.wikipedia.org/wiki/Japanese_writing_system',
        conversionOptions
    );

    converted.should.be.ok();
    const firstTable = converted[0];
    (firstTable instanceof Array).should.be.true();

    const firstRow = firstTable[0];

    // Expected cell text of the first data row, keyed by column heading.
    const expectedCells = {
        'Kanji': '私',
        'Hiragana': 'わたし',
        'Katakana': 'ワタシ',
        'Rōmaji': 'watashi',
        'English': 'I, me'
    };
    const headings = Object.keys(expectedCells);

    // Every heading must be present as a key before any cell value is compared.
    headings.forEach(function(heading) {
        _.has(firstRow, heading).should.be.true();
    });

    headings.forEach(function(heading) {
        firstRow[heading].should.equal(expectedCells[heading]);
    });
});

it.skip('Try to get a table from a nonexisting domain', async function() {
try {
await tabletojson.convertUrl('https://www.klhsfljkag.com/ydasdadad/adsaakhjg/jahsgajhvas.html');
Expand Down

0 comments on commit e55581b

Please sign in to comment.