Skip to content

Commit

Permalink
Fix broken XLSX parsing. Closes #285.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed May 31, 2014
1 parent 88480fe commit d40e9b1
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 7 deletions.
24 changes: 21 additions & 3 deletions csvkit/convert/xlsx.py
Expand Up @@ -21,6 +21,19 @@ def normalize_datetime(dt):

return dt

def has_time_elements(cell):
    """
    Try to use formatting to determine if a cell contains time info.
    See: http://office.microsoft.com/en-us/excel-help/number-format-codes-HP005198679.aspx

    Returns True when the cell's ``number_format`` contains an hour
    placeholder (``h`` or ``H``; this also covers ``hh``), False otherwise.
    A cell whose ``number_format`` is None or empty is treated as having
    no time elements.
    """
    number_format = cell.number_format

    # Nothing to inspect: an absent format cannot declare time elements.
    if not number_format:
        return False

    # NOTE: this is a plain substring test, so any 'h'/'H' appearing
    # anywhere in the format code (not just as an hour placeholder) matches.
    return 'h' in number_format or 'H' in number_format

def xlsx2csv(f, output=None, **kwargs):
"""
Convert an Excel .xlsx file to csv.
Expand All @@ -36,23 +49,28 @@ def xlsx2csv(f, output=None, **kwargs):
writer = CSVKitWriter(output)

book = load_workbook(f, use_iterators=True, data_only=True)

if 'sheet' in kwargs:
sheet = book.get_sheet_by_name(kwargs['sheet'])
else:
sheet = book.get_active_sheet()

for i, row in enumerate(sheet.iter_rows()):
if i == 0:
writer.writerow([c.internal_value for c in row])
writer.writerow([c.value for c in row])
continue

out_row = []

for c in row:
value = c.internal_value
value = c.value

if value.__class__ is datetime.datetime:
if value.time() != NULL_TIME:
if value.time() != NULL_TIME or has_time_elements(c):
# Handle default XLSX date as 00:00 time
if value.date() == datetime.date(1904, 1, 1):
value = value.time()

value = normalize_datetime(value)
else:
value = value.date()
Expand Down
2 changes: 1 addition & 1 deletion examples/sheetsxlsx_converted.csv
@@ -1,5 +1,5 @@
text,date,integer,boolean,float,time,datetime,empty_column,
Chicago Tribune,1920-01-01,164,False,41800000.01,00:00:00,1920-01-01,,
Chicago Tribune,1920-01-01,164,False,41800000.01,00:00:00,1920-01-01T00:00:00,,
Chicago Sun-Times,1948-01-01,63,True,1.27,14:57:13,1948-01-01T14:57:13,,Extra data past headers will be trimmed
Chicago Reader,1971-01-01,40,True,1,04:14:00,1971-01-01T04:14:00,,
This row has blanks,,,,,,,,
Expand Down
2 changes: 1 addition & 1 deletion examples/testxlsx_converted.csv
@@ -1,6 +1,6 @@
text,date,integer,boolean,float,time,datetime,empty_column,
Chicago Reader,1971-01-01,40,True,1,04:14:00,1971-01-01T04:14:00,,
Chicago Sun-Times,1948-01-01,63,True,1.27,14:57:13,1948-01-01T14:57:13,,Extra data beyond headers will be trimmed
Chicago Tribune,1920-01-01,164,False,41800000.01,00:00:00,1920-01-01,,
Chicago Tribune,1920-01-01,164,False,41800000.01,00:00:00,1920-01-01T00:00:00,,
This row has blanks,,,,,,,,
Unicode! Σ,,,,,,,,
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -5,7 +5,7 @@ python-dateutil>=1.5
SQLAlchemy>=0.9.3
sphinx>=1.0.7
coverage>=3.5.1b1
openpyxl>=1.8.4
openpyxl>=2.0.3
tox>=1.3
dbf==0.95.004
unittest2==0.5.1
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -7,7 +7,7 @@
'xlrd>=0.7.1',
'python-dateutil>=1.5',
'sqlalchemy>=0.6.6',
'openpyxl>=1.5.7',
'openpyxl>=2.0.3',
'dbf==0.94.003']

if sys.version_info < (2, 7):
Expand Down

0 comments on commit d40e9b1

Please sign in to comment.