diff --git a/README.md b/README.md index 045b320..0c706e1 100644 --- a/README.md +++ b/README.md @@ -388,6 +388,9 @@ are given than existing values the overcounting values are ignored. ``` +### containsClasses +Array of class names used to select only those tables that carry all of the given classes. Default is `null` (every table is converted). + ## Known issues and limitations This module only supports parsing basic tables with a simple horizontal set of headings and @@ -464,7 +467,7 @@ tabletojson.convertUrl(url) # Issues -Right now the table needs to be "well formatted" to be convertable. Tables in Html pages with not be +Right now the table needs to be "well formatted" to be convertable. Tables in tables will not be processed. ```html diff --git a/lib/tabletojson.js b/lib/tabletojson.js index 4d0ceb7..c56513b 100644 --- a/lib/tabletojson.js +++ b/lib/tabletojson.js @@ -18,6 +18,7 @@ class tabletojson { * @param options.onlyColumns Array of column indices to be used. Overrides ignoreColumn [default=null] * @param options.ignoreHiddenRows Ignoring hidden rows [default=true] * @param options.headings Array of Strings to be used as headings [default=null] + * @param options.containsClasses Array of classes to find a specific table [default=null] * @return {Object} Converted Object as an object literal */ static convert(html, options) { @@ -32,7 +33,8 @@ class tabletojson { ignoreColumns: null, onlyColumns: null, ignoreHiddenRows: true, - headings: null + headings: null, + containsClasses: null }, options ); @@ -50,7 +52,9 @@ class tabletojson { const $ = cheerio.load(html); - $('table').each(function(i, table) { + const additionalSelectors = options.containsClasses ? 
`.${options.containsClasses.join('.')}` : ''; + + $(`table${additionalSelectors}`).each(function(i, table) { const tableAsJson = []; const alreadySeen = {}; // Get column headings @@ -149,6 +153,12 @@ class tabletojson { * @param arg1.stripHtmlFromCells Strip HTML from cells [default=true] * @param arg1.stripHtml Strip off HTML [default=null] if set true stripHtmlFromHeadings and stripHtmlFromCells will also be true * @param arg1.forceIndexAsNumber Force the index to be used as number [default=false] + * @param arg1.countDuplicateHeadings If given a _ will be added to the duplicate key [default=false] + * @param arg1.ignoreColumns Array of column indices to be ignored [default=null] + * @param arg1.onlyColumns Array of column indices to be used. Overrides ignoreColumn [default=null] + * @param arg1.ignoreHiddenRows Ignoring hidden rows [default=true] + * @param arg1.headings Array of Strings to be used as headings [default=null] + * @param arg1.containsClasses Array of classes to find a specific table [default=null] * @param arg1.request Options to be passed to request object * @param arg2 Callback function to be called when the conversion finished * @return {Promise<*>} Promise containing the result diff --git a/package.json b/package.json index b5c9aba..274e9dd 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,7 @@ }, "name": "tabletojson", "description": "Converts HTML tables to JSON objects", - "version": "0.9.3", + "version": "0.9.4", "main": "./lib/tabletojson.js", "keywords": [ "table2json", diff --git a/test/tables.html b/test/tables.html index 6ccd0fd..256c9b2 100644 --- a/test/tables.html +++ b/test/tables.html @@ -218,5 +218,46 @@

Table #6: Table for conversion using options 'ignoreColumns' and 'onlyColumn +

Table #7: Table using Kanji / Hiragana

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KanjiHiraganaKatakanaRōmajiEnglish
わたしワタシwatashiI, me
金魚きんぎょキンギョkingyogoldfish
煙草 or たばこタバコtabakotobacco, cigarette
東京とうきょうトウキョウtōkyōTokyo, literally meaning "eastern capital"
+ \ No newline at end of file diff --git a/test/test-tabletojson-local.js b/test/test-tabletojson-local.js index ae83774..c008e39 100644 --- a/test/test-tabletojson-local.js +++ b/test/test-tabletojson-local.js @@ -479,4 +479,24 @@ describe('TableToJSON Local', function() { table[2].A.should.equal('Bill'); }); + + // FIX/TEST: https://github.com/maugenst/tabletojson/issues/19 + it('Test to check conversion and handling of Kanji, Hiragana, Katakana and latin texts', async function() { + const converted = await tabletojson.convert(html); + converted.should.be.ok(); + + const table = converted[6]; + + _.has(table[0], 'Kanji').should.be.true(); + _.has(table[0], 'Hiragana').should.be.true(); + _.has(table[0], 'Katakana').should.be.true(); + _.has(table[0], 'Rōmaji').should.be.true(); + _.has(table[0], 'English').should.be.true(); + + table[0]['Kanji'].should.equal('私'); + table[0]['Hiragana'].should.equal('わたし'); + table[0]['Katakana'].should.equal('ワタシ'); + table[0]['Rōmaji'].should.equal('watashi'); + table[0]['English'].should.equal('I, me'); + }); }); diff --git a/test/test-tabletojson-remote.js b/test/test-tabletojson-remote.js index 6298135..636102d 100644 --- a/test/test-tabletojson-remote.js +++ b/test/test-tabletojson-remote.js @@ -2,6 +2,7 @@ require('should'); const config = require('config'); +const _ = require('lodash'); const tabletojson = require('../lib/tabletojson'); describe('TableToJSON Remote', function() { @@ -63,6 +64,31 @@ describe('TableToJSON Remote', function() { }); }); + it('Get table from wikipedia containing Kanji, Hiragana, Katakana and latin texts', async function() { + const converted = await tabletojson.convertUrl('https://en.wikipedia.org/wiki/Japanese_writing_system', { + containsClasses: ['wikitable'], + request: { + proxy: config.get('request.proxy') + } + }); + + converted.should.be.ok(); + const table = converted[0]; + (table instanceof Array).should.be.true(); + + _.has(table[0], 'Kanji').should.be.true(); + _.has(table[0], 
'Hiragana').should.be.true(); + _.has(table[0], 'Katakana').should.be.true(); + _.has(table[0], 'Rōmaji').should.be.true(); + _.has(table[0], 'English').should.be.true(); + + table[0]['Kanji'].should.equal('私'); + table[0]['Hiragana'].should.equal('わたし'); + table[0]['Katakana'].should.equal('ワタシ'); + table[0]['Rōmaji'].should.equal('watashi'); + table[0]['English'].should.equal('I, me'); + }); + it.skip('Try to get a table from a nonexisting domain', async function() { try { await tabletojson.convertUrl('https://www.klhsfljkag.com/ydasdadad/adsaakhjg/jahsgajhvas.html');