Add IgnoreUnknownColumns to CsvFileDescription

- Modify article.htm to document the changes - Add unit test to make sure the new functionality works - Add ToString() method in `TypeFieldInfo` class for better debugging
mperdeck · Feb 28, 2014 · cc281ce · cc281ce
1 parent d86a25f
commit cc281ce
Show file tree

Hide file tree

Showing 6 changed files with 204 additions and 25 deletions.
diff --git a/LINQtoCSV.Tests/CsvContextReadTests.cs b/LINQtoCSV.Tests/CsvContextReadTests.cs
@@ -125,7 +125,7 @@ public void GoodFileCommaDelimitedUseFieldIndexForReadingDataCharUSEnglish()
             CsvFileDescription fileDescription_namesUs = new CsvFileDescription
             {
                 SeparatorChar = ',',
-                IgnoreMissingColumns = true,
+                IgnoreUnknownColumns = true,
                 UseFieldIndexForReadingData = true,
                 FirstLineHasColumnNames = false,
                 EnforceCsvColumnAttribute = true, // default is false
@@ -162,7 +162,7 @@ public void GoodFileCommaDelimitedUseFieldIndexForReadingDataCharUseOutputFormat
             CsvFileDescription fileDescription_namesUs = new CsvFileDescription
             {
                 SeparatorChar = ',',
-                IgnoreMissingColumns = true,
+                IgnoreUnknownColumns = true,
                 UseOutputFormatForParsingCsvValue = true,
 
                 UseFieldIndexForReadingData = true,
@@ -331,5 +331,41 @@ two newlines
 
             AssertRead(testInput, fileDescription_namesUs, expected);
         }
+
+        [TestMethod()]
+        public void FileWithUnknownColumns_ShouldDiscardColumns() {
+            var description = new CsvFileDescription
+                {
+                    SeparatorChar = ',',
+                    FirstLineHasColumnNames = true,
+                    IgnoreUnknownColumns = true,
+                };
+
+            //The following input has 5 columns: Id | Name | Last Name | Age | City. Only the Name, Last Name and Age will be read.
+
+            string input =
+@"Id,Name,Last Name,Age,City
+1,John,Doe,15,Washington
+2,Jane,Doe,20,New York
+";
+            var expected = new[]
+                {
+                    new Person
+                        {
+                            Name = "John",
+                            LastName = "Doe",
+                            Age = 15
+                        },
+                    new Person
+                        {
+                            Name = "Jane",
+                            LastName = "Doe",
+                            Age = 20
+                        },
+                };
+
+            AssertRead(input, description, expected);
+
+        }
     }
 }
diff --git a/LINQtoCSV.Tests/LINQtoCSV.Tests.csproj b/LINQtoCSV.Tests/LINQtoCSV.Tests.csproj
@@ -52,6 +52,7 @@
     <Compile Include="CsvContextWriteTests.cs" />
     <Compile Include="CsvContextReadTests.cs" />
     <Compile Include="IAssertable.cs" />
+    <Compile Include="Person.cs" />
     <Compile Include="ProductDataSpecificFieldIndex.cs" />
     <Compile Include="ProductData.cs" />
     <Compile Include="ProductData_DuplicateIndices.cs" />

diff --git a/LINQtoCSV.Tests/Person.cs b/LINQtoCSV.Tests/Person.cs
@@ -0,0 +1,23 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+
+namespace LINQtoCSV.Tests
+{
+    public class Person : IAssertable<Person> {
+        [CsvColumn(Name = "Name")]
+        public string Name { get; set; }
+        [CsvColumn(Name = "Last Name")]
+        public string LastName { get; set; }
+        [CsvColumn(Name = "Age")]
+        public int Age { get; set; }
+
+        public void AssertEqual(Person other) {
+            Assert.AreEqual(other.Name, Name);
+            Assert.AreEqual(other.LastName, LastName);
+            Assert.AreEqual(other.Age, Age);
+        }
+    }
+}
diff --git a/LINQtoCSV/CsvFileDescription.cs b/LINQtoCSV/CsvFileDescription.cs
@@ -87,10 +87,14 @@ public int MaximumNbrExceptions
         public Encoding TextEncoding { get; set; }
         public bool DetectEncodingFromByteOrderMarks { get; set; }
 
-        public bool IgnoreMissingColumns { get; set; }
         public bool UseFieldIndexForReadingData { get; set; }
         public bool UseOutputFormatForParsingCsvValue { get; set; }
         public bool IgnoreTrailingSeparatorChar { get; set; }
+
+        /// <summary>
+        /// If set to true, will read only the fields specified as attributes, and will discard other fields in the CSV file.
+        /// </summary>
+        public bool IgnoreUnknownColumns { get; set; }
 
         // ---------------
 
@@ -107,6 +111,7 @@ public CsvFileDescription()
             NoSeparatorChar = false;
             UseFieldIndexForReadingData = false;
             UseOutputFormatForParsingCsvValue = false;
+            IgnoreUnknownColumns = false;
         }
     }
 }
diff --git a/LINQtoCSV/FieldMapper.cs b/LINQtoCSV/FieldMapper.cs
@@ -39,6 +39,10 @@ public int CompareTo(TypeFieldInfo other)
             {
                 return index.CompareTo(other.index);
             }
+
+            public override string ToString() {
+                return string.Format("Index: {0}, Name: {1}", index, name);
+            }
         }
 
         // -----------------------------
@@ -47,6 +51,11 @@ public int CompareTo(TypeFieldInfo other)
         // to its TypeFieldInfo.
         protected TypeFieldInfo[] m_IndexToInfo = null;
 
+        /// <summary>
+        /// Contains a mapping between the CSV column indexes that will be read and the property indexes in the business object.
+        /// </summary>
+        protected IDictionary<int, int> _mappingIndexes = new Dictionary<int, int>(); 
+
         // Used to build IndexToInfo
         protected Dictionary<string, TypeFieldInfo> m_NameToInfo = null;
 
@@ -191,6 +200,8 @@ public int CompareTo(TypeFieldInfo other)
 
             int nbrTypeFields = m_NameToInfo.Keys.Count;
             m_IndexToInfo = new TypeFieldInfo[nbrTypeFields];
+
+            _mappingIndexes = new Dictionary<int, int>();
 
             int i=0;
             foreach (KeyValuePair<string, TypeFieldInfo> kvp in m_NameToInfo)
@@ -371,6 +382,7 @@ internal class FieldMapper_Reading<T> : FieldMapper<T> where T : new()
                     bool writingFile)
             : base(fileDescription, fileName, writingFile)
         {
+
         }
 
 
@@ -392,25 +404,39 @@ public void ReadNames(IDataRow row)
             // the FieldIndex fields.
 
             // If there are more names in the file then fields in the type,
-            // one of the names will not be found, causing an exception.
+            // and IgnoreUnknownColumns is set to `false`, one of the names will
+            // not be found, causing an exception.
+
+            int currentNameIndex = 0;
+            for (int i = 0; i < row.Count; i++) {
+                if (!m_NameToInfo.ContainsKey(row[i].Value)) {
+                    //If we have to ignore this column
+                    if (m_fileDescription.IgnoreUnknownColumns) {
+                        continue;
+                    }
 
-            for (int i = 0; i < row.Count; i++)
-            {
-                if (!m_NameToInfo.ContainsKey(row[i].Value))
-                {
                     // name not found
-                    throw new NameNotInTypeException(typeof(T).ToString(), row[i].Value, m_fileName);
+                    throw new NameNotInTypeException(typeof (T).ToString(), row[i].Value, m_fileName);
                 }
 
                 // ----
 
-                m_IndexToInfo[i] = m_NameToInfo[row[i].Value];
+                //Map the column index in the CSV file with the column index of the business object.
+                _mappingIndexes.Add(i, currentNameIndex);
+                currentNameIndex++;
+            }
 
-                if (m_fileDescription.EnforceCsvColumnAttribute &&
-                    (!m_IndexToInfo[i].hasColumnAttribute))
-                {
+            //Loop over the row again, skipping unmapped columns, to fill m_IndexToInfo for each mapped column.
+            for (int i = 0; i < row.Count; i++) {
+                if (!_mappingIndexes.ContainsKey(i)) {
+                    continue;
+                }
+
+                m_IndexToInfo[_mappingIndexes[i]] = m_NameToInfo[row[i].Value];
+
+                if (m_fileDescription.EnforceCsvColumnAttribute && (!m_IndexToInfo[i].hasColumnAttribute)) {
                     // enforcing column attr, but this field/prop has no column attr.
-                    throw new MissingCsvColumnAttributeException(typeof(T).ToString(), row[i].Value, m_fileName);
+                    throw new MissingCsvColumnAttributeException(typeof (T).ToString(), row[i].Value, m_fileName);
                 }
             }
         }
@@ -433,24 +459,37 @@ public List<int> GetCharLengths()
         /// <param name="row"></param>
         /// <param name="firstRow"></param>
         /// <returns></returns>
-        public T ReadObject(IDataRow row, AggregatedException ae)
-        {
-            if (!m_fileDescription.IgnoreMissingColumns && row.Count > m_IndexToInfo.Length)
+        public T ReadObject(IDataRow row, AggregatedException ae) {
+            //If there are more columns than the required
+            if (row.Count > m_IndexToInfo.Length)
             {
-                // Too many fields
-                throw new TooManyDataFieldsException(typeof(T).ToString(), row[0].LineNbr, m_fileName);
+                //Are we ignoring unknown columns?
+                if (!m_fileDescription.IgnoreUnknownColumns) {
+                    // Too many fields
+                    throw new TooManyDataFieldsException(typeof (T).ToString(), row[0].LineNbr, m_fileName);
+                }
             }
 
             // -----
 
             T obj = new T();
 
-            int maxRowCount = Math.Min(row.Count, m_IndexToInfo.Length);
+            //If we will be using the mappings, we just iterate through all the cells in this row
+            int maxRowCount = _mappingIndexes.Count > 0 ? row.Count : Math.Min(row.Count, m_IndexToInfo.Length);
+
+            for (int i = 0; i < maxRowCount; i++) {
+                TypeFieldInfo tfi;
+                //If there is some index mapping generated and IgnoreUnknownColumns is `true`
+                if (m_fileDescription.IgnoreUnknownColumns && _mappingIndexes.Count > 0) {
+                    if (!_mappingIndexes.ContainsKey(i)) {
+                        continue;
+                    }
+                    tfi = m_IndexToInfo[_mappingIndexes[i]];
+                }
+                else {
+                    tfi = m_IndexToInfo[i];
+                }
 
-            for (int i = 0; i < maxRowCount; i++)
-            {
-                TypeFieldInfo tfi = m_IndexToInfo[i];
-
                 if (m_fileDescription.EnforceCsvColumnAttribute &&
                         (!tfi.hasColumnAttribute))
                 {

diff --git a/article.htm b/article.htm
@@ -465,7 +465,10 @@ <h2>CsvFileDescription<a id="CsvFileDescription"></a></h2>
 <li><a href="#NoSeparatorChar"><code>NoSeparatorChar</code></a> </li>
 
 <li><a href="#UseFieldIndexForReadingData"><code>UseFieldIndexForReadingData</code></a> </li>
-    <li><a href="#IgnoreTrailingSeparatorChar"><code>IgnoreTrailingSeparatorChar</code></a> </li>
+
+<li><a href="#IgnoreTrailingSeparatorChar"><code>IgnoreTrailingSeparatorChar</code></a> </li>
+
+<li><a href="#IgnoreUnknownColumns"><code>IgnoreUnknownColumns</code></a> </li>
 </ul>
 
 <h3><a id="SeparatorChar">SeparatorChar</a></h3>
@@ -838,6 +841,78 @@ <h4>Example:</h4>
 
 <p>Though it's not a canonical representation of CSV file, <code>IgnoreTrailingSeparatorChar</code> property tells <code>Read</code> to ignore separator character at the end of the line.</p>
 
+<!-----------IgnoreUnknownColumns----------->
+<h3><a id="IgnoreUnknownColumns"></a>IgnoreUnknownColumns</h3>
+
+<table cellpadding="3">
+    <tbody>
+        <tr>
+            <td><strong>Type:</strong></td>
+
+            <td><code lang="cs">bool</code></td>
+        </tr>
+
+        <tr>
+            <td><strong>Default:</strong></td>
+
+            <td>false</td>
+        </tr>
+
+        <tr>
+            <td><strong>Applies to:</strong></td>
+
+            <td>Reading only</td>
+        </tr>
+    </tbody>
+</table>
+
+<h4>Example:</h4>
+
+There are cases where you don't need to read all the columns, but only a subset of them. Consider the following example of a CSV file containing a list of people:<br><br>
+
+<table cellspacing="0" cellpadding="5" border="1">
+    <tbody>
+        <tr>
+            <th>Id</th>
+            <th>Name</th>
+            <th>Last Name</th>
+            <th>Age</th>
+            <th>City</th>
+        </tr>
+        <tr>
+            <td>1</td>
+            <td>John</td>
+            <td>Doe</td>
+            <td>15</td>
+            <td>Washington</td>
+        </tr>
+        <tr>
+            <td>2</td>
+            <td>Jane</td>
+            <td>Doe</td>
+            <td>20</td>
+            <td>New York</td>
+        </tr>
+    </tbody>
+</table>
+<br>
+Suppose you have the following class:
+<pre lang="cs">
+    class Person {
+        [CsvColumn(Name = "Name")]
+        public string Name { get ; set; }
+        [CsvColumn(Name = "Last Name")]
+        public string LastName { get; set; }
+        [CsvColumn(Name = "Age")]
+        public int Age { get; set; }
+    }
+</pre>
+If you set <pre lang="cs">fd.IgnoreUnknownColumns = true;</pre>
+
+then the fields <code>Id</code> and <code>City</code> will be ignored.
+
+<!---------------------------------------------->
+
 <h2>CsvColumn Attribute<a id="CsvColumn_Attribute"></a></h2>
 
 <p>As shown in the <a href="#How_to_use">Reading from a file</a> and <a href="#writing_to_a_file">Writing to a file</a> examples, you can decorate the public fields and properties of your data class with the <code>CsvColumn</code> attribute to specify such things as the output format for date and number fields.</p>