Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added an option to escape delimiters in CSV files when using bulk copy #1312

Merged
merged 6 commits into from
May 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ public class SQLServerBulkCSVFileRecord extends SQLServerBulkRecord implements j
*/
private final String delimiter;

private boolean escapeDelimiters;

// Regex to ignore delimiter when the field is enclosed in quotes.
private static final String escapeSplitPattern = "(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)";
ulvii marked this conversation as resolved.
Show resolved Hide resolved

/*
* Class names for logging.
*/
Expand Down Expand Up @@ -189,15 +194,18 @@ private void initFileReader(InputStreamReader sr, String encoding, String demlim
if (firstLineIsColumnNames) {
currentLine = fileReader.readLine();
if (null != currentLine) {
columnNames = currentLine.split(delimiter, -1);
columnNames = (escapeDelimiters && currentLine.contains("\""))
peterbae marked this conversation as resolved.
Show resolved Hide resolved
? escapeQuotesRFC4180(currentLine.split(
delimiter + escapeSplitPattern))
: currentLine.split(delimiter, -1);
}
}
}

private void initLoggerResources() {
super.loggerPackageName = "com.microsoft.sqlserver.jdbc.SQLServerBulkCSVFileRecord";
}

/**
* Releases any resources associated with the file reader.
*
Expand Down Expand Up @@ -229,21 +237,22 @@ public Object[] getRowData() throws SQLServerException {
if (null == currentLine)
return null;
else {
// Binary data may be corrupted
// The limit in split() function should be a negative value,
// otherwise trailing empty strings are discarded.
// Empty string is returned if there is no value.
String[] data = currentLine.split(delimiter, -1);

// Cannot go directly from String[] to Object[] and expect it to act
// as an array.
/*
* Binary data may be corrupted. The limit in the split() function should be a negative value, otherwise trailing
* empty strings are discarded. An empty string is returned if there is no value.
*/
String[] data = (escapeDelimiters && currentLine.contains("\""))
? escapeQuotesRFC4180(currentLine.split(
delimiter + escapeSplitPattern))
: currentLine.split(delimiter, -1);

// Cannot go directly from String[] to Object[] and expect it to act as an array.

Object[] dataRow = new Object[data.length];

for (Entry<Integer, ColumnMetadata> pair : columnMetadata.entrySet()) {
ColumnMetadata cm = pair.getValue();

// Reading a column not available in csv
// positionInFile > number of columns retrieved after split
// Reading a column not available in the CSV: positionInFile > number of columns retrieved after split.
if (data.length < pair.getKey() - 1) {
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_invalidColumn"));
Object[] msgArgs = {pair.getKey()};
Expand Down Expand Up @@ -271,8 +280,7 @@ public Object[] getRowData() throws SQLServerException {
* data (say "10") is to be inserted into an numeric column. Our implementation does the same.
*/
case Types.INTEGER: {
// Formatter to remove the decimal part as SQL
// Server floors the decimal in integer types
// Formatter to remove the decimal part as SQL Server floors the decimal in integer types.
DecimalFormat decimalFormatter = new DecimalFormat("#");
decimalFormatter.setRoundingMode(RoundingMode.DOWN);
String formatedfInput = decimalFormatter
Expand All @@ -283,8 +291,7 @@ public Object[] getRowData() throws SQLServerException {

case Types.TINYINT:
case Types.SMALLINT: {
// Formatter to remove the decimal part as SQL
// Server floors the decimal in integer types
// Formatter to remove the decimal part as SQL Server floors the decimal in integer types.
DecimalFormat decimalFormatter = new DecimalFormat("#");
decimalFormatter.setRoundingMode(RoundingMode.DOWN);
String formatedfInput = decimalFormatter
Expand Down Expand Up @@ -315,9 +322,7 @@ public Object[] getRowData() throws SQLServerException {
}

case Types.BIT: {
// "true" => 1, "false" => 0
// Any non-zero value (integer/double) => 1, 0/0.0
// => 0
// "true" => 1, "false" => 0. Any non-zero value (integer/double) => 1, 0/0.0 => 0
try {
dataRow[pair.getKey()
- 1] = (0 == Double.parseDouble(data[pair.getKey() - 1])) ? Boolean.FALSE
Expand Down Expand Up @@ -485,9 +490,10 @@ else if ((null != columnNames) && (columnNames.length >= positionInSource))
columnMetadata.put(positionInSource,
new ColumnMetadata(colName, java.sql.Types.LONGNVARCHAR, precision, scale, dateTimeFormatter));
break;

// Redirecting Float as Double based on data type mapping
// https://msdn.microsoft.com/en-us/library/ms378878%28v=sql.110%29.aspx
/*
* Redirecting Float as Double based on data type mapping
* https://msdn.microsoft.com/library/ms378878%28v=sql.110%29.aspx
*/
case java.sql.Types.FLOAT:
columnMetadata.put(positionInSource,
new ColumnMetadata(colName, java.sql.Types.DOUBLE, precision, scale, dateTimeFormatter));
Expand Down Expand Up @@ -516,4 +522,63 @@ public boolean next() throws SQLServerException {
}
return (null != currentLine);
}

/**
* Returns whether the rules to escape delimiters are used.
*
* @return true if the rules are used, false otherwise.
*/
public boolean isEscapeColumnDelimitersCSV() {
peterbae marked this conversation as resolved.
Show resolved Hide resolved
return escapeDelimiters;
}

/**
 * Enables or disables the delimiter-escaping rules. When enabled, CSV content is parsed with these rules:
 * <ul>
 * <li>Each field may or may not be enclosed in double quotes.</li>
 * <li>If a field is not enclosed in double quotes, double quotes may not appear inside it.</li>
 * <li>Fields containing double quotes or delimiters should be enclosed in double quotes.</li>
 * <li>Inside a field enclosed in double quotes, a double quote must be escaped by preceding it with another
 * double quote.</li>
 * <li>Spaces are considered part of a field; spaces before and after the enclosing double quotes are
 * ignored.</li>
 * </ul>
 *
 * @param escapeDelimiters
 *        {@code true} if the rules above are to be used, {@code false} otherwise.
 */
public void setEscapeColumnDelimitersCSV(boolean escapeDelimiters) {
    this.escapeDelimiters = escapeDelimiters;
}

private static String[] escapeQuotesRFC4180(String[] tokens) throws SQLServerException {
if (null == tokens) {
return tokens;
}
for (int i = 0; i < tokens.length; i++) {
boolean escaped = false;
int j = 0;
StringBuilder sb = new StringBuilder();
long quoteCount = tokens[i].chars().filter(ch -> ch == '"').count();
peterbae marked this conversation as resolved.
Show resolved Hide resolved
if (quoteCount > 0) {
tokens[i] = tokens[i].trim();
}
if (0 != quoteCount % 2 || (quoteCount > 0
&& ('"' != tokens[i].charAt(0) || '"' != tokens[i].charAt(tokens[i].length() - 1)))) {
throw new SQLServerException(SQLServerException.getErrString("R_InvalidCSVQuotes"), null, 0, null);
}
while (j < tokens[i].length()) {
if ('"' == tokens[i].charAt(j)) {
if (!escaped) {
escaped = true;
} else {
if ((j < tokens[i].length() - 1) && '"' == tokens[i].charAt(j + 1)) {
ulvii marked this conversation as resolved.
Show resolved Hide resolved
sb.append('"');
j++;
}
}
} else {
sb.append(tokens[i].charAt(j));
}
j++;
}
tokens[i] = sb.toString();
}
return tokens;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -639,5 +639,7 @@ protected Object[][] getContents() {
{"R_pvkParseError", "Could not read Private Key from PVK, check the password provided."},
{"R_pvkHeaderError", "Cannot parse the PVK, PVK file does not contain the correct header."},
{"R_clientCertError", "Reading client certificate failed. Please verify the location of the certificate."},
{"R_unassignableError", "The class specified by the {0} property must be assignable to {1}."}};
{"R_unassignableError", "The class specified by the {0} property must be assignable to {1}."},
{"R_InvalidCSVQuotes", "Failed to parse the CSV file, verify that the fields are correctly enclosed in double quotes."},
};
};
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,21 @@
package com.microsoft.sqlserver.jdbc.bulkCopy;

import static org.junit.Assert.fail;
import static org.junit.jupiter.api.Assertions.assertEquals;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Arrays;

import org.junit.jupiter.api.AfterAll;
Expand All @@ -26,9 +31,11 @@
import org.junit.runner.RunWith;

import com.microsoft.sqlserver.jdbc.ComparisonUtil;
import com.microsoft.sqlserver.jdbc.RandomUtil;
import com.microsoft.sqlserver.jdbc.SQLServerBulkCSVFileRecord;
import com.microsoft.sqlserver.jdbc.SQLServerBulkCopy;
import com.microsoft.sqlserver.jdbc.TestUtils;
import com.microsoft.sqlserver.testframework.AbstractSQLGenerator;
import com.microsoft.sqlserver.testframework.AbstractTest;
import com.microsoft.sqlserver.testframework.Constants;
import com.microsoft.sqlserver.testframework.DBConnection;
Expand All @@ -54,6 +61,7 @@ public class BulkCopyCSVTest extends AbstractTest {

static String inputFile = "BulkCopyCSVTestInput.csv";
static String inputFileNoColumnName = "BulkCopyCSVTestInputNoColumnName.csv";
static String inputFileDelimiterEscape = "BulkCopyCSVTestInputDelimiterEscape.csv";
static String encoding = "UTF-8";
static String delimiter = ",";

Expand All @@ -77,9 +85,13 @@ public static void setUpConnection() {
@Test
@DisplayName("Test SQLServerBulkCSVFileRecord")
public void testCSV() {
try (SQLServerBulkCSVFileRecord fileRecord = new SQLServerBulkCSVFileRecord(filePath + inputFile, encoding,
delimiter, true)) {
testBulkCopyCSV(fileRecord, true);
String fileName = filePath + inputFile;
try (SQLServerBulkCSVFileRecord f1 = new SQLServerBulkCSVFileRecord(fileName, encoding, delimiter, true);
SQLServerBulkCSVFileRecord f2 = new SQLServerBulkCSVFileRecord(fileName, encoding, delimiter, true);) {
testBulkCopyCSV(f1, true);

f2.setEscapeColumnDelimitersCSV(true);
testBulkCopyCSV(f2, true);
} catch (SQLException e) {
fail(e.getMessage());
}
Expand All @@ -91,9 +103,13 @@ public void testCSV() {
@Test
@DisplayName("Test SQLServerBulkCSVFileRecord First line not being column name")
public void testCSVFirstLineNotColumnName() {
try (SQLServerBulkCSVFileRecord fileRecord = new SQLServerBulkCSVFileRecord(filePath + inputFileNoColumnName,
encoding, delimiter, false)) {
testBulkCopyCSV(fileRecord, false);
String fileName = filePath + inputFileNoColumnName;
try (SQLServerBulkCSVFileRecord f1 = new SQLServerBulkCSVFileRecord(fileName, encoding, delimiter, false);
SQLServerBulkCSVFileRecord f2 = new SQLServerBulkCSVFileRecord(fileName, encoding, delimiter, false)) {
testBulkCopyCSV(f1, false);

f2.setEscapeColumnDelimitersCSV(true);
testBulkCopyCSV(f2, false);
} catch (SQLException e) {
fail(e.getMessage());
}
Expand All @@ -118,6 +134,63 @@ public void testCSVFromURL() throws SQLException {
}
}

/**
* A test to validate that the driver parses CSV file according to RFC4180 when setEscapeColumnDelimitersCSV is set
* to true.
*
* @throws Exception
*/
@Test
ulvii marked this conversation as resolved.
Show resolved Hide resolved
@DisplayName("Test setEscapeColumnDelimitersCSV")
public void testEscapeColumnDelimitersCSV() throws Exception {
String tableName = AbstractSQLGenerator.escapeIdentifier(RandomUtil.getIdentifier("BulkEscape"));
String fileName = filePath + inputFileDelimiterEscape;
/*
* The list below is the copy of inputFileDelimiterEscape with quotes removed.
*/
String[][] expectedEscaped = new String[11][4];
expectedEscaped[0] = new String[] {"test", " test\"", "no@split", " testNoQuote"};
expectedEscaped[1] = new String[] {null, null, null, null};
expectedEscaped[2] = new String[] {"\"", "test\"test", "test@\" test", null};
expectedEscaped[3] = new String[] {"testNoQuote ", " testSpaceAround ", " testSpaceInside ",
" testSpaceQuote\" "};
expectedEscaped[4] = new String[] {null, null, null, " testSpaceInside "};
expectedEscaped[5] = new String[] {"1997", "Ford", "E350", "E63"};
expectedEscaped[6] = new String[] {"1997", "Ford", "E350", "E63"};
expectedEscaped[7] = new String[] {"1997", "Ford", "E350", "Super@ luxurious truck"};
expectedEscaped[8] = new String[] {"1997", "Ford", "E350", "Super@ \"luxurious\" truck"};
expectedEscaped[9] = new String[] {"1997", "Ford", "E350", "E63"};
expectedEscaped[10] = new String[] {"1997", "Ford", "E350", " Super luxurious truck "};

try (Connection con = getConnection(); Statement stmt = con.createStatement();
SQLServerBulkCopy bulkCopy = new SQLServerBulkCopy(con);
SQLServerBulkCSVFileRecord fileRecord = new SQLServerBulkCSVFileRecord(fileName, encoding, "@",
false)) {
bulkCopy.setDestinationTableName(tableName);
fileRecord.setEscapeColumnDelimitersCSV(true);
fileRecord.addColumnMetadata(1, null, java.sql.Types.INTEGER, 0, 0);
fileRecord.addColumnMetadata(2, null, java.sql.Types.VARCHAR, 50, 0);
fileRecord.addColumnMetadata(3, null, java.sql.Types.VARCHAR, 50, 0);
fileRecord.addColumnMetadata(4, null, java.sql.Types.VARCHAR, 50, 0);
fileRecord.addColumnMetadata(5, null, java.sql.Types.VARCHAR, 50, 0);
stmt.executeUpdate("CREATE TABLE " + tableName
+ " (id INT IDENTITY(1,1), c1 VARCHAR(50), c2 VARCHAR(50), c3 VARCHAR(50), c4 VARCHAR(50))");
bulkCopy.writeToServer(fileRecord);

int i = 0;
try (ResultSet rs = stmt.executeQuery("SELECT * FROM " + tableName + " ORDER BY id");
BufferedReader br = new BufferedReader(new FileReader(fileName));) {
while (rs.next()) {
assertEquals(expectedEscaped[i][0], rs.getString("c1"));
assertEquals(expectedEscaped[i][1], rs.getString("c2"));
assertEquals(expectedEscaped[i][2], rs.getString("c3"));
assertEquals(expectedEscaped[i][3], rs.getString("c4"));
i++;
}
}
}
}

private void testBulkCopyCSV(SQLServerBulkCSVFileRecord fileRecord, boolean firstLineIsColumnNames) {
DBTable destTable = null;
try (BufferedReader br = new BufferedReader(
Expand Down
11 changes: 11 additions & 0 deletions src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
1@"test"@ " test"""@ "no@split" @ testNoQuote
2@""@ ""@ ""@ ""
3@""""@ "test""test"@ "test@"" test"@ ""
4@testNoQuote @ testSpaceAround @ " testSpaceInside "@ " testSpaceQuote"" "
5@""@ ""@ ""@ " testSpaceInside "
6@1997@Ford@E350@E63
7@"1997"@"Ford"@"E350"@"E63"
8@1997@Ford@E350@"Super@ luxurious truck"
9@1997@Ford@E350@"Super@ ""luxurious"" truck"
10@1997@ "Ford" @E350@ "E63"
11@1997@Ford@E350@" Super luxurious truck "