Skip to content

Commit

Permalink
Changed CsvReaders to use FileRecordIndex internally to avoid clash w…
Browse files Browse the repository at this point in the history
…ith CachedCsvReader

Update benchmarks on current hardware
  • Loading branch information
phatcher committed Mar 24, 2018
1 parent 68c0154 commit e6be48b
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 100 deletions.
70 changes: 35 additions & 35 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -211,18 +211,18 @@ When addMark is true, consecutive null bytes will be replaced by [removed x null

Performance differences shown when tested with 20 million null bytes (20MB in storage) :
```csharp
CsvReader - without using NullRemovalStreamReader : 156248815 ticks, 57.2749 sec., 0.3492 MB/sec.
CsvReader - without using NullRemovalStreamReader : 536968 ticks, 0.2120 sec., 94.3518 MB/sec.

CsvReader - with NullRemovalStreamReader without mark : 447185 ticks, 0.1639 sec., 122.0100 MB/sec.
CsvReader - with NullRemovalStreamReader without mark : 191137 ticks, 0.0755 sec., 265.0660 MB/sec.
AddMark =(False) LastCell =(cell63 followed by 20971520 null bytes)

CsvReader - with NullRemovalStreamReader with mark : 447222 ticks, 0.1639 sec., 121.9999 MB/sec.
CsvReader - with NullRemovalStreamReader with mark : 168819 ticks, 0.0666 sec., 300.1079 MB/sec.
AddMark =(True) LastCell =(cell63 followed by 20971520 null bytes[removed 20971520 null bytes])
```

Adjust number of null bytes in benchmark to see how much memory/time you will be able to save:
```csharp
X:\Path\CsvReader\build\Debug\CsvReaderBenchmarks\net461>CsvReaderBenchmarks.exe CsvNullRemovalStreamReader
X:\Path\CsvReader\build\Debug\CsvReaderBenchmarks\net461>CsvReaderBenchmarks.exe NullRemoval
```


Expand All @@ -231,55 +231,55 @@ One of the main reasons for using this library is its excellent performance on r
```csharp
Test pass #1 - All fields

CsvReader - No cache : 3495429 ticks, 1.7940 sec., 24.5265 MB/sec.
CachedCsvReader - Run 1 : 6641089 ticks, 3.4084 sec., 12.9091 MB/sec.
CachedCsvReader - Run 2 : 4393 ticks, 0.0023 sec., 19515.3071 MB/sec.
TextFieldParser : 36877894 ticks, 18.9270 sec., 2.3247 MB/sec.
Regex : 15011358 ticks, 7.7044 sec., 5.7111 MB/sec.
CsvReader - No cache : 3134597 ticks, 1.2374 sec., 35.5582 MB/sec.
CachedCsvReader - Run 1 : 7452030 ticks, 2.9418 sec., 14.9571 MB/sec.
CachedCsvReader - Run 2 : 4525 ticks, 0.0018 sec., 24632.1821 MB/sec.
TextFieldParser : 31568009 ticks, 12.4617 sec., 3.5308 MB/sec.
Regex : 11273590 ticks, 4.4503 sec., 9.8869 MB/sec.

Test pass #1 - Field #72 (middle)

CsvReader - No cache : 2085871 ticks, 1.0705 sec., 41.1007 MB/sec.
CachedCsvReader - Run 1 : 6205399 ticks, 3.1848 sec., 13.8155 MB/sec.
CachedCsvReader - Run 2 : 214 ticks, 0.0001 sec., 400610.9533 MB/sec.
TextFieldParser : 36458115 ticks, 18.7116 sec., 2.3515 MB/sec.
Regex : 6976827 ticks, 3.5808 sec., 12.2879 MB/sec.
CsvReader - No cache : 2358656 ticks, 0.9311 sec., 47.2560 MB/sec.
CachedCsvReader - Run 1 : 7119186 ticks, 2.8104 sec., 15.6564 MB/sec.
CachedCsvReader - Run 2 : 325 ticks, 0.0001 sec., 342955.7662 MB/sec.
TextFieldParser : 31171440 ticks, 12.3052 sec., 3.5757 MB/sec.
Regex : 5793093 ticks, 2.2869 sec., 19.2403 MB/sec.


Test pass #2 - All fields

CsvReader - No cache : 3431492 ticks, 1.7612 sec., 24.9835 MB/sec.
CachedCsvReader - Run 1 : 6110812 ticks, 3.1363 sec., 14.0294 MB/sec.
CachedCsvReader - Run 2 : 173 ticks, 0.0001 sec., 495553.4335 MB/sec.
TextFieldParser : 36671647 ticks, 18.8212 sec., 2.3378 MB/sec.
Regex : 15064341 ticks, 7.7315 sec., 5.6910 MB/sec.
CsvReader - No cache : 2941954 ticks, 1.1614 sec., 37.8866 MB/sec.
CachedCsvReader - Run 1 : 7204077 ticks, 2.8439 sec., 15.4719 MB/sec.
CachedCsvReader - Run 2 : 314 ticks, 0.0001 sec., 354970.1401 MB/sec.
TextFieldParser : 31213609 ticks, 12.3218 sec., 3.5709 MB/sec.
Regex : 11095897 ticks, 4.3802 sec., 10.0452 MB/sec.

Test pass #2 - Field #72 (middle)

CsvReader - No cache : 2162568 ticks, 1.1099 sec., 39.6430 MB/sec.
CachedCsvReader - Run 1 : 5135074 ticks, 2.6355 sec., 16.6951 MB/sec.
CachedCsvReader - Run 2 : 220 ticks, 0.0001 sec., 389685.2000 MB/sec.
TextFieldParser : 36913575 ticks, 18.9453 sec., 2.3225 MB/sec.
Regex : 7107108 ticks, 3.6476 sec., 12.0627 MB/sec.
CsvReader - No cache : 2186909 ticks, 0.8633 sec., 50.9672 MB/sec.
CachedCsvReader - Run 1 : 7131654 ticks, 2.8153 sec., 15.6290 MB/sec.
CachedCsvReader - Run 2 : 296 ticks, 0.0001 sec., 376556.1622 MB/sec.
TextFieldParser : 31381026 ticks, 12.3879 sec., 3.5518 MB/sec.
Regex : 5151353 ticks, 2.0335 sec., 21.6372 MB/sec.


Test pass #3 - All fields

CsvReader - No cache : 3552781 ticks, 1.8234 sec., 24.1306 MB/sec.
CachedCsvReader - Run 1 : 5668694 ticks, 2.9094 sec., 15.1235 MB/sec.
CachedCsvReader - Run 2 : 186 ticks, 0.0001 sec., 460917.9785 MB/sec.
TextFieldParser : 36650220 ticks, 18.8102 sec., 2.3392 MB/sec.
Regex : 15108079 ticks, 7.7540 sec., 5.6745 MB/sec.
CsvReader - No cache : 2693834 ticks, 1.0634 sec., 41.3762 MB/sec.
CachedCsvReader - Run 1 : 7105358 ticks, 2.8049 sec., 15.6868 MB/sec.
CachedCsvReader - Run 2 : 326 ticks, 0.0001 sec., 341903.7546 MB/sec.
TextFieldParser : 31323784 ticks, 12.3653 sec., 3.5583 MB/sec.
Regex : 11303752 ticks, 4.4622 sec., 9.8605 MB/sec.

Test pass #3 - Field #72 (middle)

CsvReader - No cache : 2212999 ticks, 1.1358 sec., 38.7396 MB/sec.
CachedCsvReader - Run 1 : 5246701 ticks, 2.6928 sec., 16.3399 MB/sec.
CachedCsvReader - Run 2 : 214 ticks, 0.0001 sec., 400610.9533 MB/sec.
TextFieldParser : 36718316 ticks, 18.8451 sec., 2.3348 MB/sec.
Regex : 7049832 ticks, 3.6182 sec., 12.1607 MB/sec.
CsvReader - No cache : 2177773 ticks, 0.8597 sec., 51.1810 MB/sec.
CachedCsvReader - Run 1 : 7326816 ticks, 2.8923 sec., 15.2127 MB/sec.
CachedCsvReader - Run 2 : 328 ticks, 0.0001 sec., 339818.9756 MB/sec.
TextFieldParser : 31168390 ticks, 12.3040 sec., 3.5761 MB/sec.
Regex : 5134853 ticks, 2.0270 sec., 21.7067 MB/sec.


Done
```
This was run on a high-spec machine (Xeon E5-2620, 16 GB RAM and 512 GB SSD; you have to have some toys!), so the overall throughput would be good, but CsvReader performs at 10x the speed of TextFieldParser and 5x faster than Regex.
This was run on a high-spec machine (Xeon E5-2640, 32 GB RAM and M.2 1 TB SSD; you have to have some toys!), so the overall throughput would be good, but CsvReader performs at 10x the speed of TextFieldParser and 5x faster than Regex.
3 changes: 2 additions & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#### 4.0.0 (2018-01-30)
#### 4.0.0 (2018-03-24)
* Support .NET Framework 2.0+ and .NET Standard 1.3+
* Use DateTime.TryParseExact if needed (@milcondoin)
* Override column capability (@spintronic)
* Fix handling of duplicate headers (@jonreis)
* Reduced exception overhead (@spintronic)
* Added NullBytesRemoval functionality to reduce memory usage and runtime (@andrewpsy)
* Use StringBuilder instead of string concatenation to speed up reading of huge cells (@andrewpsy)
* Fixed: Exception in GetFieldType() when hasHeaders is false (@molopony)
* Fixed: Exception when reading data and hasHeaders is false (@molopony)
* Fixed: ArgumentOutOfRangeException when hasHeaders is false (@molopony)
Expand Down
17 changes: 10 additions & 7 deletions code/CsvReaderBenchmarks/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,20 @@ private static void Main(string[] args)
{
if (args.Length == 1)
{
var s = args[0].ToUpper();

switch (s)
switch (args[0].ToUpperInvariant())
{
case "CSVREADER":
CsvReaderBenchmark.Run(TestFile3);
return;

case "CSVNULLREMOVALSTREAMREADER":
case "NULLREMOVAL":
PerformanceTestWithNullRemovalStreamReader();
return;
case "CSVSTRINGBUILDER":

case "STRINGBUILDER":
PerformanceTestWithStringBuilder();
return;

#if !NETCOREAPP1_0 && !NETCOREAPP2_0
case "OLEDB":
OleDbBenchmark.Run(TestFile3);
Expand All @@ -49,15 +49,18 @@ private static void Main(string[] args)
case "REGEX":
RegexBenchmark.Run(TestFile3);
return;

default:
Console.WriteLine(@"Possible values: CsvReader, NullRemoval, StringBuilder, OleDb, Regex");
break;
}
}

Console.WriteLine(@"Possible values : CsvReader, CsvNullRemovalStreamReader, CsvStringBuilder, OleDb, Regex");
return;
}

const int field = 72;
long fileSize = new FileInfo(TestFile2).Length / 1024 / 1024;
var fileSize = new FileInfo(TestFile2).Length / 1024 / 1024;

for (var i = 1; i < 4; i++)
{
Expand Down
65 changes: 30 additions & 35 deletions code/LumenWorks.Framework.IO/Csv/CachedCsvReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@ namespace LumenWorks.Framework.IO.Csv
/// <remarks>The number of records is limited to <see cref="System.Int32.MaxValue"/> - 1.</remarks>
public class CachedCsvReader : CsvReader, IListSource
{
/// <summary>
/// Contains the current record index (inside the cached records array).
/// </summary>
private long _currentRecordIndex;

/// <summary>
/// Indicates if a new record is being read from the CSV stream.
/// </summary>
Expand Down Expand Up @@ -148,26 +143,25 @@ public CachedCsvReader(TextReader reader, bool hasHeaders, char delimiter, char
: base(reader, hasHeaders, delimiter, quote, escape, comment, trimmingOptions, bufferSize, nullValue)
{
Records = new List<string[]>();
_currentRecordIndex = -1;
CacheRecordIndex = -1;
}

/// <summary>
/// Gets the current record index in the CSV file.
/// </summary>
/// <value>The current record index in the CSV file.</value>
public override long CurrentRecordIndex
{
get { return _currentRecordIndex; }
}
public override long CurrentRecordIndex => CacheRecordIndex;

/// <summary>
/// Contains the current record index (inside the cached records array).
/// </summary>
protected long CacheRecordIndex { get; private set; }

/// <summary>
/// Gets a value that indicates whether the current stream position is at the end of the stream.
/// </summary>
/// <value><see langword="true"/> if the current stream position is at the end of the stream; otherwise <see langword="false"/>.</value>
public override bool EndOfStream
{
get { return _currentRecordIndex >= base.CurrentRecordIndex && base.EndOfStream; }
}
public override bool EndOfStream => CacheRecordIndex >= FileRecordIndex && base.EndOfStream;

/// <summary>
/// Gets the field at the specified index.
Expand All @@ -187,14 +181,14 @@ public override string this[int field]
return base[field];
}

if (_currentRecordIndex > -1)
if (CacheRecordIndex > -1)
{
if (field > -1 && field < this.FieldCount)
if (field > -1 && field < FieldCount)
{
return Records[(int) _currentRecordIndex][field];
return Records[(int) CacheRecordIndex][field];
}

throw new ArgumentOutOfRangeException("field", field, string.Format(CultureInfo.InvariantCulture, ExceptionMessage.FieldIndexOutOfRange, field));
throw new ArgumentOutOfRangeException(nameof(field), field, string.Format(CultureInfo.InvariantCulture, ExceptionMessage.FieldIndexOutOfRange, field));
}

throw new InvalidOperationException(ExceptionMessage.NoCurrentRecord);
Expand All @@ -209,7 +203,7 @@ public override string this[int field]
/// </exception>
public virtual void ReadToEnd()
{
_currentRecordIndex = base.CurrentRecordIndex;
CacheRecordIndex = FileRecordIndex;

while (ReadNextRecord()) ;
}
Expand All @@ -231,38 +225,42 @@ public virtual void ReadToEnd()
/// </exception>
protected override bool ReadNextRecord(bool onlyReadHeaders, bool skipToNextLine)
{
if (_currentRecordIndex < base.CurrentRecordIndex)
if (CacheRecordIndex < FileRecordIndex)
{
_currentRecordIndex++;
CacheRecordIndex++;
return true;
}
else
{
_readingStream = true;

try
{
bool canRead = base.ReadNextRecord(onlyReadHeaders, skipToNextLine);
var canRead = base.ReadNextRecord(onlyReadHeaders, skipToNextLine);

if (canRead)
{
string[] record = new string[this.FieldCount];
var record = new string[FieldCount];

if (base.CurrentRecordIndex > -1)
if (FileRecordIndex > -1)
{
CopyCurrentRecordTo(record);
Records.Add(record);
}
else
{
if (MoveTo(0))
{
CopyCurrentRecordTo(record);
}

MoveTo(-1);
}

if (!onlyReadHeaders)
_currentRecordIndex++;
{
CacheRecordIndex++;
}
}
else
{
Expand All @@ -284,15 +282,15 @@ protected override bool ReadNextRecord(bool onlyReadHeaders, bool skipToNextLine
/// </summary>
public void MoveToStart()
{
_currentRecordIndex = -1;
CacheRecordIndex = -1;
}

/// <summary>
/// Moves to the last record read so far.
/// </summary>
public void MoveToLastCachedRecord()
{
_currentRecordIndex = base.CurrentRecordIndex;
CacheRecordIndex = FileRecordIndex;
}

/// <summary>
Expand All @@ -308,20 +306,17 @@ public override bool MoveTo(long record)
record = -1;
}

if (record <= base.CurrentRecordIndex)
if (record <= FileRecordIndex)
{
_currentRecordIndex = record;
CacheRecordIndex = record;
return true;
}

_currentRecordIndex = base.CurrentRecordIndex;
CacheRecordIndex = FileRecordIndex;
return base.MoveTo(record);
}

bool IListSource.ContainsListCollection
{
get { return false; }
}
bool IListSource.ContainsListCollection => false;

System.Collections.IList IListSource.GetList()
{
Expand Down
Loading

0 comments on commit e6be48b

Please sign in to comment.