Skip to content

Commit

Permalink
Merge pull request #88 from dooho-h/fix-rowspan-offset
Browse files Browse the repository at this point in the history
Fix Offset Calculation in Tables with Complex Rowspans
  • Loading branch information
maugenst committed Feb 8, 2024
2 parents 7be53cc + a16b923 commit 7cfb73e
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 23 deletions.
65 changes: 42 additions & 23 deletions lib/Tabletojson.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ export type TableToJsonOptions = {

export type CallbackFunction = (conversionResult: any) => any;

/**
 * Bookkeeping entry for one column while rows are converted.
 *
 * A non-null entry means a cell from an earlier row still spans into this column:
 * `content` is that cell's content, which is written into each following row it covers,
 * and `value` is the number of rows it still has to cover (decremented once per row).
 * `null` marks a column with no active rowspan.
 */
type RowSpan = {content: string; value: number} | null;

export class Tabletojson {
static convert(
html: string,
Expand Down Expand Up @@ -208,9 +210,10 @@ export class Tabletojson {
const cells: cheerio.Cheerio = options.useFirstRowForHeadings
? $(row).find('td, th')
: $(row).find('th');
cells.each((j: number, cell: cheerio.Element) => {
if (options.onlyColumns && !options.onlyColumns.includes(j)) return;
if (options.ignoreColumns && !options.onlyColumns && options.ignoreColumns.includes(j)) return;
cells.each((cellIndex: number, cell: cheerio.Element) => {
if (options.onlyColumns && !options.onlyColumns.includes(cellIndex)) return;
if (options.ignoreColumns && !options.onlyColumns && options.ignoreColumns.includes(cellIndex))
return;
let value: string = '';

if (options.headings) {
Expand All @@ -230,15 +233,15 @@ export class Tabletojson {
const seen: any = alreadySeen[value];
if (seen && options.countDuplicateHeadings) {
suffix = ++alreadySeen[value];
columnHeadings[j] = value !== '' ? `${value}_${suffix}` : `${j}`;
columnHeadings[cellIndex] = value !== '' ? `${value}_${suffix}` : `${cellIndex}`;
} else {
alreadySeen[value] = 1;
columnHeadings[j] = value;
columnHeadings[cellIndex] = value;
}
});
});

let rowspans: any[] = [];
let rowspans: RowSpan[] = [];

// Fetch each row
$(table)
Expand All @@ -255,19 +258,19 @@ export class Tabletojson {
}

// Add content from rowspans
rowspans.forEach((rowspan: any, index: number) => {
rowspans.forEach((rowspan, index) => {
if (!rowspan) return;

setColumn(index, rowspan.content);

rowspan.value--;
});
const nextrowspans: any[] = [...rowspans];
const nextrowspans = [...rowspans];

const cells: cheerio.Cheerio = options.useFirstRowForHeadings
? $(row).find('td, th')
: $(row).find('td');
cells.each((j: number, cell: cheerio.Element) => {
cells.each((cellIndex: number, cell: cheerio.Element) => {
// ignoreHiddenRows
if (options.ignoreHiddenRows) {
const style: string | undefined = $(row).attr('style');
Expand All @@ -278,18 +281,15 @@ export class Tabletojson {
}

// Apply rowspans offsets
let aux: number = j;
j = 0;
do {
while (rowspans[j]) j++;
while (aux && !rowspans[j]) {
j++;
aux--;
}
} while (aux);
const adjustedIndex = applyOffsets(cellIndex, rowspans);

if (options.onlyColumns && !options.onlyColumns.includes(j)) return;
if (options.ignoreColumns && !options.onlyColumns && options.ignoreColumns.includes(j)) return;
if (options.onlyColumns && !options.onlyColumns.includes(adjustedIndex)) return;
if (
options.ignoreColumns &&
!options.onlyColumns &&
options.ignoreColumns.includes(adjustedIndex)
)
return;

const cheerioCell: cheerio.Cheerio = $(cell);
const cheerioCellText: string = cheerioCell.text();
Expand All @@ -302,15 +302,15 @@ export class Tabletojson {
? cheerioCellHtml.trim()
: '';

setColumn(j, content);
setColumn(adjustedIndex, content);

// Check rowspan
const value: number = cheerioCellRowspan ? parseInt(cheerioCellRowspan, 10) - 1 : 0;
if (value > 0) nextrowspans[j] = {content, value};
if (value > 0) nextrowspans[adjustedIndex] = {content, value};
});

rowspans = nextrowspans;
rowspans.forEach((rowspan: any, index: number) => {
rowspans.forEach((rowspan, index) => {
if (rowspan && rowspan.value === 0) rowspans[index] = null;
});

Expand Down Expand Up @@ -403,4 +403,23 @@ export class Tabletojson {
}
}
}

/**
 * Translates the position of a cell among the cells physically present in a row
 * into the column index it occupies once columns taken by active rowspans are skipped.
 *
 * @param cellIndex Zero-based position of the cell within its own row.
 * @param rowspans Per-column rowspan state; a truthy entry marks a column that is
 * already filled by a cell from an earlier row.
 * @returns The index of the `cellIndex`-th free column. When `rowspans` does not
 * contain enough free columns, counting continues past its end.
 */
const applyOffsets = (cellIndex: number, rowspans: RowSpan[]) => {
    let freeSeen = 0;
    let column = 0;

    while (column < rowspans.length) {
        // Only columns without an active rowspan can receive a cell of this row.
        if (!rowspans[column]) {
            if (freeSeen === cellIndex) {
                return column;
            }
            freeSeen += 1;
        }
        column += 1;
    }

    // Every tracked column was inspected: the remaining (cellIndex - freeSeen)
    // free columns lie directly after the end of the tracked range.
    return rowspans.length + cellIndex - freeSeen;
};

export {Tabletojson as tabletojson};
47 changes: 47 additions & 0 deletions test/tables.html
Original file line number Diff line number Diff line change
Expand Up @@ -1390,6 +1390,53 @@ <h2>Table #12: Table with complex rowspans</h2>
</tbody>
</table>

<h2>Table #12-a: Table with extremely complex rowspans</h2>
<table id="table12-a" class="table" border="1">
<thead>
<tr>
<th>Department</th>
<th>Major</th>
<th>Class</th>
<th>Instructor</th>
<th>Credit</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="4">Engineering</td>
<td rowspan="3">Computer Science</td>
<td>CS101</td>
<td>Kim</td>
<td rowspan="2">3</td>
</tr>
<tr>
<td>CS201</td>
<td rowspan="2">Garcia</td>
</tr>
<tr>
<td>CS303</td>
<td>2</td>
</tr>
<tr>
<td>Electrical Engineering</td>
<td>EE101</td>
<td>Müller</td>
<td>3</td>
</tr>
<tr>
<td rowspan="2">Social Science</td>
<td rowspan="2">Economics</td>
<td>EC101</td>
<td>Nguyen</td>
<td rowspan="2">3</td>
</tr>
<tr>
<td>EC401</td>
<td>Smith</td>
</tr>
</tbody>
</table>

<h2>Table #13: Table with no headers</h2>
<table id="table13" class="table" border="1">
<tbody>
Expand Down
46 changes: 46 additions & 0 deletions test/tabletojsonLocal.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,52 @@ describe('TableToJSON Local', function () {
expect(table[4].Age).toBe('17');
});

it('Complex rowspan usage leads to correct object representation', async function () {
    const converted = tabletojson.convert(html, {id: ['table12-a']});
    expect(converted).toBeDefined();
    expect(converted.length).toBe(1);

    const table = converted[0];
    expect(table.length).toBe(6);

    // Expected cell values per heading, listed from the first to the last body row.
    // Cells covered by a rowspan must repeat the content of the spanning cell.
    const expectedColumns: [string, string[]][] = [
        [
            'Department',
            ['Engineering', 'Engineering', 'Engineering', 'Engineering', 'Social Science', 'Social Science'],
        ],
        [
            'Major',
            [
                'Computer Science',
                'Computer Science',
                'Computer Science',
                'Electrical Engineering',
                'Economics',
                'Economics',
            ],
        ],
        ['Class', ['CS101', 'CS201', 'CS303', 'EE101', 'EC101', 'EC401']],
        ['Instructor', ['Kim', 'Garcia', 'Garcia', 'Müller', 'Nguyen', 'Smith']],
        ['Credit', ['3', '3', '2', '3', '3', '3']],
    ];

    for (const [heading, values] of expectedColumns) {
        values.forEach((expected, rowIndex) => {
            expect(table[rowIndex][heading]).toBe(expected);
        });
    }
});

it('Options: containsClasses', async function () {
const converted = tabletojson.convert(html, {
containsClasses: ['table'],
Expand Down

0 comments on commit 7cfb73e

Please sign in to comment.