Implement new xlsx support based on openpyxl.

wireservice · Dec 26, 2011 · fd62299 · fd62299
1 parent a8567bb
commit fd62299
Show file tree

Hide file tree

Showing 5 changed files with 128 additions and 2 deletions.
diff --git a/csvkit/convert/__init__.py b/csvkit/convert/__init__.py
@@ -4,8 +4,9 @@
 from fixed import fixed2csv
 from js import json2csv
 from xls import xls2csv
+from xlsx import xlsx2csv
 
-SUPPORTED_FORMATS = ['fixed', 'xls', 'csv', 'json']
+SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json']
 
 def convert(f, format, schema=None, key=None, **kwargs):
     """
@@ -24,6 +25,8 @@ def convert(f, format, schema=None, key=None, **kwargs):
         return fixed2csv(f, schema, **kwargs)
     elif format == 'xls':
         return xls2csv(f, **kwargs)
+    elif format == 'xlsx':
+        return xlsx2csv(f, **kwargs)
     elif format == 'json':
         return json2csv(f, key, **kwargs)
     elif format == 'csv':
@@ -45,6 +48,8 @@ def guess_format(filename):
 
     if extension == 'xls':
         return extension
+    elif extension == 'xlsx':
+        return extension
     elif extension in ['json', 'js']:
         return 'json' 
     elif extension == 'csv':

diff --git a/csvkit/convert/xlsx.py b/csvkit/convert/xlsx.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+from cStringIO import StringIO
+import datetime
+from types import NoneType
+
+from openpyxl.reader.excel import load_workbook
+
+from csvkit import table
+
+def normalize_empty(values, **kwargs):
+    """
+    Normalize a column in which every cell is empty.
+
+    Returns a ``(type, values)`` pair: the type is ``None`` and the values
+    are a new list holding one ``None`` per input cell. Extra keyword
+    arguments are accepted and ignored.
+    """
+    blanks = [None] * len(values)
+    return None, blanks
+
+def normalize_unicode(values, **kwargs):
+    """
+    Normalize a column of text cells.
+
+    Returns a ``(type, values)`` pair: the type is ``unicode`` and every
+    cell is converted with ``unicode()``. Only genuinely empty cells
+    (``None`` or the empty string) become ``None``; other falsy values such
+    as ``0``, ``0.0`` or ``False`` are kept as text, because this normalizer
+    is also the fallback for mixed-type columns where those are real data.
+    Extra keyword arguments are accepted and ignored.
+    """
+    return unicode, [None if v is None or v == '' else unicode(v) for v in values]
+
+def normalize_ints(values, **kwargs):
+    """
+    Normalize a column of integer cells.
+
+    The cells need no conversion: the same list object is handed back,
+    paired with ``int`` as the column type. Extra keyword arguments are
+    accepted and ignored.
+    """
+    normal_type = int
+    return normal_type, values
+
+def normalize_floats(values, **kwargs):
+    """
+    Normalize a column of float cells.
+
+    Returns a ``(type, values)`` pair: the type is ``float`` and every
+    non-empty cell is converted with ``float()`` (so integers in a mixed
+    int/float column are promoted). Empty cells (``None``) are passed
+    through as ``None`` rather than handed to ``float()``, which would
+    raise ``TypeError``. Extra keyword arguments are accepted and ignored.
+    """
+    return float, [float(v) if v is not None else None for v in values]
+
+def normalize_datetimes(values, **kwargs):
+    """
+    Normalize a column of datetime cells.
+
+    The cells are not converted: the same list object is handed back,
+    paired with ``datetime.datetime`` as the column type. Extra keyword
+    arguments are accepted and ignored.
+    """
+    normal_type = datetime.datetime
+    return normal_type, values
+
+def normalize_dates(values, **kwargs):
+    """
+    Normalize a column of date cells.
+
+    The cells are not converted: the same list object is handed back,
+    paired with ``datetime.date`` as the column type. Extra keyword
+    arguments are accepted and ignored.
+    """
+    normal_type = datetime.date
+    return normal_type, values
+
+def normalize_booleans(values, **kwargs):
+    """
+    Normalize a column of boolean cells.
+
+    Returns a ``(type, values)`` pair: the type is ``bool`` and every
+    non-empty cell is converted with ``bool()``. Empty cells -- ``None`` as
+    well as the empty string -- stay ``None`` so that a missing value is
+    not silently reported as ``False``. Extra keyword arguments are
+    accepted and ignored.
+    """
+    return bool, [None if v is None or v == '' else bool(v) for v in values]
+
+# Maps the type chosen for a column by determine_column_type() to the
+# function that normalizes that column's values.
+# NOTE(review): xlsx2csv looks entries up with NORMALIZERS[column_type], so a
+# column type that is not a key here (e.g. str or long) would raise KeyError
+# -- confirm whether openpyxl can ever yield such cell values.
+# TODO (original author's note; what remains to be done is not stated)
+NORMALIZERS = {
+    unicode: normalize_unicode,
+    datetime.datetime: normalize_datetimes,
+    datetime.date: normalize_dates,
+    bool: normalize_booleans,
+    int: normalize_ints,
+    float: normalize_floats,
+    NoneType: normalize_empty
+}
+
+def determine_column_type(types):
+    """
+    Pick a single type for a column given the types of its cells.
+
+    ``NoneType`` entries (empty cells) are ignored. Rules, in order:
+
+    * no types left -> ``NoneType``
+    * exactly one type -> that type
+    * ``int`` mixed with ``float`` -> ``float``
+    * ``datetime.datetime`` mixed with ``datetime.date`` ->
+      ``datetime.datetime``
+    * any other mixture -> ``unicode`` (text)
+    """
+    kinds = set(types) - set([NoneType])
+
+    if not kinds:
+        return NoneType
+
+    if len(kinds) == 1:
+        return kinds.pop()
+
+    # Two compatible types can be widened instead of falling back to text
+    if kinds == set([int, float]):
+        return float
+
+    if kinds == set([datetime.datetime, datetime.date]):
+        return datetime.datetime
+
+    # Normalize every other mixture to text
+    return unicode
+
+def xlsx2csv(f, **kwargs):
+    """
+    Convert an Excel .xlsx file to csv.
+
+    Loads the workbook from ``f`` with openpyxl, reads the active sheet
+    column by column and returns the CSV text written by the table's
+    ``to_csv``. The first cell of each column supplies the column name and
+    the remaining cells are normalized to a single type. Reading stops at
+    the first column whose name is empty. Extra keyword arguments are
+    accepted and ignored.
+    """
+    workbook = load_workbook(f)
+    worksheet = workbook.get_active_sheet()
+
+    result = table.Table()
+
+    for index, cells in enumerate(worksheet.columns):
+        # The first cell of the column is its header
+        header = cells[0].value
+
+        # Empty column name? Truncate remaining data
+        if not header:
+            break
+
+        raw_values = [cell.value for cell in cells[1:]]
+        cell_types = [type(v) for v in raw_values]
+
+        normalizer = NORMALIZERS[determine_column_type(cell_types)]
+        normal_type, normal_values = normalizer(raw_values)
+
+        result.append(
+            table.Column(index, header, normal_values, normal_type=normal_type)
+        )
+
+    # Render into an in-memory buffer and hand back its contents
+    buf = StringIO()
+    result.to_csv(buf)
+    csv_text = buf.getvalue()
+    buf.close()
+
+    return csv_text
+
diff --git a/csvkit/unicsv.py b/csvkit/unicsv.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import codecs
 import csv
 from cStringIO import StringIO

diff --git a/csvkit/utilities/in2csv.py b/csvkit/utilities/in2csv.py
@@ -45,7 +45,7 @@ def main(self):
 
         if isinstance(self.args.file, file):
             f = self.args.file
-        elif format == 'xls':
+        elif format in ('xls', 'xlsx'):
             f = open(self.args.file, 'rb')
         else:
             f = open(self.args.file, 'rU')

diff --git a/requirements.txt b/requirements.txt
@@ -5,3 +5,4 @@ python-dateutil==1.5
 sqlalchemy==0.6.6
 sphinx==1.0.7
 coverage==3.5.1b1
+openpyxl==1.5.6