From ea741feb6028b80e9dbd7ad1aa930c603b620c9f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 19 Jan 2023 15:12:53 -0800 Subject: [PATCH 1/7] Stop using {typed: true} for csv and tsv --- src/table.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index 05f30469..33f87dc2 100644 --- a/src/table.js +++ b/src/table.js @@ -191,8 +191,8 @@ function sourceCache(loadSource) { const loadTableDataSource = sourceCache(async (source, name) => { if (source instanceof FileAttachment) { switch (source.mimeType) { - case "text/csv": return source.csv({typed: true}); - case "text/tab-separated-values": return source.tsv({typed: true}); + case "text/csv": return source.csv(); + case "text/tab-separated-values": return source.tsv(); case "application/json": return source.json(); case "application/x-sqlite3": return source.sqlite(); } From 558a6a6bdfa8ea5f33ad636abfde9a7001c4dbac Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 19 Jan 2023 15:14:14 -0800 Subject: [PATCH 2/7] Infer schema if none exists --- src/table.js | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 33f87dc2..7f180da3 100644 --- a/src/table.js +++ b/src/table.js @@ -543,8 +543,9 @@ export function getTypeValidator(colType) { // DuckDBClient for data arrays, too, and then we wouldn’t need our own __table // function to do table operations on in-memory data? export function __table(source, operations) { - const input = source; let {schema, columns} = source; + if (!schema) source.schema = inferSchema(source); + const input = source; let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); for (const {type, operands} of operations.filter) { @@ -666,3 +667,79 @@ export function __table(source, operations) { } return source; } + +function initKey() { + return { + other: 0, + boolean: 0, + integer: 0, + number: 0, + date: 0, + string: 0, + array: 0, + object: 0, + bigint: 0, // TODO for csv, tsv? + buffer: 0 + }; +} + +function inferSchema(source) { + const schema = []; + const sampleSize = 100; + const sample = source.slice(0, sampleSize); + const typeCounts = {}; + sample.map((d) => { + for (const key in d) { + if (!typeCounts[key]) typeCounts[key] = initKey(); + // for json and sqlite, we already have some types, but for csv and tsv, all + // columns are strings here. + const type = typeof d[key]; + const value = type === "string" ? d[key]?.trim() : d[key]; + if (value === null || value === undefined || value.length === 0) + typeCounts[key]["other"]++; + else if (type !== "string") { + if (Array.isArray(value)) typeCounts[key]["array"]++; + else if (value instanceof Date) typeCounts[key]["date"]++; + else if (value instanceof ArrayBuffer) typeCounts[key]["buffer"]++; + else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object + } else { + if (value === "true" || value === "false") + typeCounts[key]["boolean"]++; + else if (!isNaN(+value) && /^-?[0-9]+$/.test(value)) + typeCounts[key]["integer"]++; + else if (!isNaN(+value)) typeCounts[key]["number"]++; + else if ( + value.match( + /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ + ) + ) + typeCounts[key]["date"]++; + else if (value.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})/)) + typeCounts[key]["date"]++; + else if (value.match(/(\d{4})-(\d{1,2})-(\d{1,2})/)) + typeCounts[key]["date"]++; + else typeCounts[key]["string"]++; + } + } + }); + const columns = Object.keys(typeCounts); + for (const col of columns) { + // sort descending so most commonly encoutered type is first + const typesSorted = Object.keys(typeCounts[col]).sort(function (a, b) { + return typeCounts[col][b] - typeCounts[col][a]; + }); + let type = typesSorted[0]; + if (type === "other") { + // take the next-most-encountered type if most are "other", but only if + // its tally is greater than the next one in the list + if (typeCounts[typesSorted[1]] > typeCounts[typesSorted[2]]) + type = typesSorted[1]; + // else we could iterate over the sample and use the first encountered type + } + schema.push({ + name: col, + type: type + }); + } + return schema; +} \ No newline at end of file From ba09d45f71a847d19ed4d00e9cdc438dc3b47253 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Fri, 20 Jan 2023 09:55:13 -0800 Subject: [PATCH 3/7] Add schema validity check to address #9673 --- src/table.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 7f180da3..1dedda1f 100644 --- a/src/table.js +++ b/src/table.js @@ -544,7 +544,7 @@ export function getTypeValidator(colType) { // function to do table operations on in-memory data? export function __table(source, operations) { let {schema, columns} = source; - if (!schema) source.schema = inferSchema(source); + if (!schema || !isValidSchema(schema)) source.schema = inferSchema(source); const input = source; let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); @@ -668,6 +668,13 @@ export function __table(source, operations) { return source; } +function isValidSchema(schema) { + if (!schema || !Array.isArray(schema)) return; + return schema.every((s) => { + s && typeof s.name === "string" && typeof s.type === "string"; + }); +} + function initKey() { return { other: 0, From 3a3f5a15cad0ea022a939f794c5cf2c0ff3e37ee Mon Sep 17 00:00:00 2001 From: Libbey White Date: Fri, 20 Jan 2023 11:19:56 -0800 Subject: [PATCH 4/7] Update tests --- test/table-test.js | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index 0ef3e53d..e04d7a42 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -454,9 +454,10 @@ describe("__table", () => { const operationsNullColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: null}}; assert.deepStrictEqual(__table(source, operationsNullColumns), source); const operationsEmptyColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: []}}; - assert.deepStrictEqual(__table(source, operationsEmptyColumns), [{}, {}, {}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsEmptyColumns).slice(), [{}, {}, {}]); const operationsSelectedColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: ["a"]}}; - assert.deepStrictEqual(__table(source, operationsSelectedColumns), [{a: 1}, {a: 2}, {a: 3}]); + assert.deepStrictEqual(__table(source, operationsSelectedColumns).slice(), [{a: 1}, {a: 2}, {a: 3}]); }); it("__table unknown filter", () => { @@ -480,7 +481,8 @@ describe("__table", () => { {type: "gt", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2}]} ] }; - assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]); }); it("__table filter lte + gte", () => { @@ -496,7 +498,8 @@ describe("__table", () => { {type: "gte", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2.5}]} ] }; - assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]); }); it("__table filter primitive lte + gte", () => { @@ -526,8 +529,9 @@ describe("__table", () => { [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}] ); const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]}; + // comparing the result of .slice() removes schema from the comparison assert.deepStrictEqual( - __table(source, operationsAsc), + __table(source, operationsAsc).slice(), [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}] ); const sourceExtended = [...source, {a: 1, b: 3, c: 3}, {a: 1, b: 5, c: 3}]; @@ -549,8 +553,9 @@ describe("__table", () => { [{a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: undefined}, {a: NaN}, {a: null}] ); const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]}; + // comparing the result of .slice() removes schema from the comparison assert.deepStrictEqual( - __table(sourceWithMissing, operationsAsc), + __table(sourceWithMissing, operationsAsc).slice(), [{a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: undefined}, {a: NaN}, {a: null}] ); }); @@ -561,8 +566,9 @@ describe("__table", () => { __table(source, operations), [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}] ); + // comparing the result of .slice() removes schema from the comparison assert.deepStrictEqual( - source, + source.slice(), [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}] ); }); @@ -571,9 +577,10 @@ describe("__table", () => { const operationsToNull = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: null}}; assert.deepStrictEqual(__table(source, operationsToNull), [{a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]); const operationsFromNull = {...EMPTY_TABLE_DATA.operations, slice: {from: null, to: 1}}; - assert.deepStrictEqual(__table(source, operationsFromNull), [{a: 1, b: 2, c: 3}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsFromNull).slice(), [{a: 1, b: 2, c: 3}]); const operations = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: 2}}; - assert.deepStrictEqual(__table(source, operations), [{a: 2, b: 4, c: 6}]); + assert.deepStrictEqual(__table(source, operations).slice(), [{a: 2, b: 4, c: 6}]); }); it("__table retains schema and columns info", () => { @@ -585,6 +592,13 @@ describe("__table", () => { [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}] ); }); + + it("__table infers schema", () => { + assert.deepStrictEqual( + __table(source, EMPTY_TABLE_DATA.operations).schema, + [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}] + ); + }); }); describe("getTypeValidator filters accurately", () => { From e151efd096df9585a1380c6d0d6333e322743342 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Fri, 20 Jan 2023 11:49:42 -0800 Subject: [PATCH 5/7] Handle sources that are arrays of primitives --- src/table.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 1dedda1f..cc5731f9 100644 --- a/src/table.js +++ b/src/table.js @@ -693,7 +693,10 @@ function initKey() { function inferSchema(source) { const schema = []; const sampleSize = 100; - const sample = source.slice(0, sampleSize); + let sample = source.slice(0, sampleSize); + if (arrayIsPrimitive(sample)) { + sample = sample.map(d => {return {value: d};}); + } const typeCounts = {}; sample.map((d) => { for (const key in d) { From ec38311aa6cb8e82e364ced7dc8967372c883c4f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Sat, 21 Jan 2023 08:33:35 -0800 Subject: [PATCH 6/7] Formatting --- src/table.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index cc5731f9..141ecc4e 100644 --- a/src/table.js +++ b/src/table.js @@ -695,7 +695,9 @@ function inferSchema(source) { const sampleSize = 100; let sample = source.slice(0, sampleSize); if (arrayIsPrimitive(sample)) { - sample = sample.map(d => {return {value: d};}); + sample = sample.map((d) => { + return {value: d}; + }); } const typeCounts = {}; sample.map((d) => { @@ -713,8 +715,7 @@ function inferSchema(source) { else if (value instanceof ArrayBuffer) typeCounts[key]["buffer"]++; else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object } else { - if (value === "true" || value === "false") - typeCounts[key]["boolean"]++; + if (value === "true" || value === "false") typeCounts[key]["boolean"]++; else if (!isNaN(+value) && /^-?[0-9]+$/.test(value)) typeCounts[key]["integer"]++; else if (!isNaN(+value)) typeCounts[key]["number"]++; From 6e9d64e12cd0aa3988c165b36f4075c03439ef1f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 23 Jan 2023 13:13:16 -0800 Subject: [PATCH 7/7] Quick updates based on feedback --- src/table.js | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/table.js b/src/table.js index 141ecc4e..63a3d01a 100644 --- a/src/table.js +++ b/src/table.js @@ -700,41 +700,40 @@ function inferSchema(source) { }); } const typeCounts = {}; - sample.map((d) => { + for (const d of sample) { for (const key in d) { if (!typeCounts[key]) typeCounts[key] = initKey(); // for json and sqlite, we already have some types, but for csv and tsv, all // columns are strings here. const type = typeof d[key]; - const value = type === "string" ? d[key]?.trim() : d[key]; + const value = type === "string" ? d[key].trim() : d[key]; if (value === null || value === undefined || value.length === 0) - typeCounts[key]["other"]++; + typeCounts[key].other++; else if (type !== "string") { - if (Array.isArray(value)) typeCounts[key]["array"]++; - else if (value instanceof Date) typeCounts[key]["date"]++; - else if (value instanceof ArrayBuffer) typeCounts[key]["buffer"]++; + if (Array.isArray(value)) typeCounts[key].array++; + else if (value instanceof Date) typeCounts[key].date++; + else if (value instanceof ArrayBuffer) typeCounts[key].buffer++; else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object } else { - if (value === "true" || value === "false") typeCounts[key]["boolean"]++; - else if (!isNaN(+value) && /^-?[0-9]+$/.test(value)) - typeCounts[key]["integer"]++; - else if (!isNaN(+value)) typeCounts[key]["number"]++; - else if ( + if (value === "true" || value === "false") typeCounts[key].boolean++; + else if (!isNaN(value)) { + if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++; + else typeCounts[key].number++; + } else if ( value.match( /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ ) ) - typeCounts[key]["date"]++; + typeCounts[key].date++; else if (value.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})/)) - typeCounts[key]["date"]++; + typeCounts[key].date++; else if (value.match(/(\d{4})-(\d{1,2})-(\d{1,2})/)) - typeCounts[key]["date"]++; - else typeCounts[key]["string"]++; + typeCounts[key].date++; + else typeCounts[key].string++; } } - }); - const columns = Object.keys(typeCounts); - for (const col of columns) { + } + for (const col in typeCounts) { // sort descending so most commonly encoutered type is first const typesSorted = Object.keys(typeCounts[col]).sort(function (a, b) { return typeCounts[col][b] - typeCounts[col][a];