Skip to content

Commit

Permalink
Implement new xlsx support based on openpyxl.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed Dec 26, 2011
1 parent a8567bb commit fd62299
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 2 deletions.
7 changes: 6 additions & 1 deletion csvkit/convert/__init__.py
Expand Up @@ -4,8 +4,9 @@
from fixed import fixed2csv
from js import json2csv
from xls import xls2csv
from xlsx import xlsx2csv

SUPPORTED_FORMATS = ['fixed', 'xls', 'csv', 'json']
SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json']

def convert(f, format, schema=None, key=None, **kwargs):
"""
Expand All @@ -24,6 +25,8 @@ def convert(f, format, schema=None, key=None, **kwargs):
return fixed2csv(f, schema, **kwargs)
elif format == 'xls':
return xls2csv(f, **kwargs)
elif format == 'xlsx':
return xlsx2csv(f, **kwargs)
elif format == 'json':
return json2csv(f, key, **kwargs)
elif format == 'csv':
Expand All @@ -45,6 +48,8 @@ def guess_format(filename):

if extension == 'xls':
return extension
elif extension == 'xlsx':
return extension
elif extension in ['json', 'js']:
return 'json'
elif extension == 'csv':
Expand Down
118 changes: 118 additions & 0 deletions csvkit/convert/xlsx.py
@@ -0,0 +1,118 @@
#!/usr/bin/env python

from cStringIO import StringIO
import datetime
from types import NoneType

from openpyxl.reader.excel import load_workbook

from csvkit import table

def normalize_empty(values, **kwargs):
    """
    Normalize a column in which every cell is empty.

    Returns a ``(type, values)`` pair: the type is ``None`` and the values
    are one ``None`` for each input cell.
    """
    cell_count = len(values)
    blanks = [None] * cell_count
    return None, blanks

def normalize_unicode(values, **kwargs):
    """
    Normalize a column of text cells.

    Every non-empty cell is converted with ``unicode``; empty cells
    (``None`` or the empty string) become ``None``. Returns a
    ``(type, values)`` pair whose type is ``unicode``.
    """
    # Compare against the empty markers explicitly instead of testing
    # truthiness: mixed-type columns are normalized as text too, and
    # falsy-but-real values such as 0, 0.0 or False must be kept.
    return unicode, [
        None if v is None or v == '' else unicode(v)
        for v in values
    ]

def normalize_ints(values, **kwargs):
    """
    Normalize a column of integer cells.

    The cells are handed back untouched; returns a ``(type, values)``
    pair whose type is ``int``.
    """
    column_type = int
    return column_type, values

def normalize_floats(values, **kwargs):
    """
    Normalize a column of float cells.

    Every non-empty cell is converted with ``float`` (so integers in a
    mixed int/float column are promoted); empty cells stay ``None``.
    Returns a ``(type, values)`` pair whose type is ``float``.
    """
    # Empty cells are ignored when the column type is determined, so a
    # float column may still contain None; float(None) would raise.
    return float, [None if v is None else float(v) for v in values]

def normalize_datetimes(values, **kwargs):
    """
    Normalize a column of datetime cells.

    The cells are handed back untouched; returns a ``(type, values)``
    pair whose type is ``datetime.datetime``.
    """
    column_type = datetime.datetime
    return column_type, values

def normalize_dates(values, **kwargs):
    """
    Normalize a column of date cells.

    The cells are handed back untouched; returns a ``(type, values)``
    pair whose type is ``datetime.date``.
    """
    column_type = datetime.date
    return column_type, values

def normalize_booleans(values, **kwargs):
    """
    Normalize a column of boolean cells.

    Every non-empty cell is converted with ``bool``; empty cells
    (``None`` or the empty string) become ``None``. Returns a
    ``(type, values)`` pair whose type is ``bool``.
    """
    # None must be checked explicitly: bool(None) is False, which would
    # turn an empty cell into a real False value.
    return bool, [None if v is None or v == '' else bool(v) for v in values]

# TODO (the outstanding work is not stated in the original note)
# Lookup table from a column's detected type to the function that
# normalizes the values of such a column. xlsx2csv indexes it with the
# result of determine_column_type; NoneType is the key used for columns
# that contain only empty cells.
NORMALIZERS = {
    unicode: normalize_unicode,
    datetime.datetime: normalize_datetimes,
    datetime.date: normalize_dates,
    bool: normalize_booleans,
    int: normalize_ints,
    float: normalize_floats,
    NoneType: normalize_empty
}

def determine_column_type(types):
"""
Determine the correct type for a column from a list of cell types.
"""
types_set = set(types)
types_set.discard(NoneType)

if len(types_set) == 2:
if types_set == set([int, float]):
return float
elif types_set == set([datetime.datetime, datetime.date]):
return datetime.datetime

# Normalize mixed types to text
if len(types_set) > 1:
return unicode

try:
return types_set.pop()
except KeyError:
return NoneType

def xlsx2csv(f, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    The active sheet of the workbook loaded from ``f`` is read column by
    column. The first cell of each column is used as its name and the
    remaining cells as its values; reading stops at the first column
    whose name is empty. Extra keyword arguments are accepted but not
    used here.

    Returns the CSV output as a string.
    """
    book = load_workbook(f)
    sheet = book.get_active_sheet()

    tab = table.Table()

    for i, column in enumerate(sheet.columns):
        # The first cell of the column holds its header
        column_name = column[0].value

        # Empty column name? Truncate remaining data
        if not column_name:
            break

        values = [c.value for c in column[1:]]
        types = [type(v) for v in values]

        column_type = determine_column_type(types)

        # Cell types without a dedicated normalizer are written out as
        # text instead of aborting the whole conversion with a KeyError.
        normalizer = NORMALIZERS.get(column_type, normalize_unicode)
        t, normal_values = normalizer(values)

        tab.append(table.Column(i, column_name, normal_values, normal_type=t))

    o = StringIO()

    try:
        tab.to_csv(o)
        return o.getvalue()
    finally:
        # Release the buffer even when serialization fails
        o.close()

2 changes: 2 additions & 0 deletions csvkit/unicsv.py
@@ -1,3 +1,5 @@
#!/usr/bin/env python

import codecs
import csv
from cStringIO import StringIO
Expand Down
2 changes: 1 addition & 1 deletion csvkit/utilities/in2csv.py
Expand Up @@ -45,7 +45,7 @@ def main(self):

if isinstance(self.args.file, file):
f = self.args.file
elif format == 'xls':
elif format in ('xls', 'xlsx'):
f = open(self.args.file, 'rb')
else:
f = open(self.args.file, 'rU')
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -5,3 +5,4 @@ python-dateutil==1.5
sqlalchemy==0.6.6
sphinx==1.0.7
coverage==3.5.1b1
openpyxl==1.5.6

0 comments on commit fd62299

Please sign in to comment.