Skip to content

Added support for read_fwf() as in R. #952

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 2, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 165 additions & 39 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,50 +74,54 @@
%s
""" % (_parser_params % _csv_sep)

# Docstring for read_csv: the shared parser-parameter template filled in
# with the CSV-specific separator description.
_read_csv_doc = """
Read CSV (comma-separated) file into DataFrame

%s
""" % (_parser_params % _csv_sep)

# Docstring for read_table: same template, filled in with the general
# (tab/whitespace) separator description.
_read_table_doc = """
Read general delimited file into DataFrame

%s
""" % (_parser_params % _table_sep)

@Appender(_read_csv_doc)
def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None,
skiprows=None, na_values=None, parse_dates=False,
date_parser=None, nrows=None, iterator=False, chunksize=None,
skip_footer=0, converters=None, verbose=False, delimiter=None,
encoding=None):

_fwf_widths = """\
colspecs : a list of pairs (tuples), giving the extents
of the fixed-width fields of each line as half-open internals
(i.e., [from, to[ ).
widths : a list of field widths, which can be used instead of
'colspecs' if the intervals are contiguous.
"""

# Full docstring for read_fwf: the shared parser-parameter template extended
# with the fixed-width-specific 'colspecs'/'widths' entries plus a note that
# 'delimiter' doubles as the filler character between fields.
_read_fwf_doc = """
Read a table of fixed-width formatted lines into DataFrame

%s

Also, 'delimiter' is used to specify the filler character of the
fields if it is not spaces (e.g., '~').
""" % (_parser_params % _fwf_widths)


def _read(cls, filepath_or_buffer, kwds):
"Generic reader of line files."
if hasattr(filepath_or_buffer, 'read'):
f = filepath_or_buffer
else:
encoding = kwds.get('encoding', None)
try:
# universal newline mode
f = com._get_handle(filepath_or_buffer, 'U', encoding=encoding)
except Exception: # pragma: no cover
f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)

if delimiter is not None:
sep = delimiter
if kwds.get('date_parser', None) is not None:
kwds['parse_dates'] = True

if date_parser is not None:
parse_dates = True
# Extract some of the arguments (pass chunksize on).
kwds.pop('filepath_or_buffer')
iterator = kwds.pop('iterator')
nrows = kwds.pop('nrows')
chunksize = kwds.get('chunksize', None)

parser = TextParser(f, header=header, index_col=index_col,
names=names, na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
skiprows=skiprows,
delimiter=sep,
chunksize=chunksize,
skip_footer=skip_footer,
converters=converters,
verbose=verbose,
encoding=encoding)
# Create the parser.
parser = cls(f, **kwds)

if nrows is not None:
return parser.get_chunk(nrows)
Expand All @@ -126,19 +130,102 @@ def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None,

return parser.get_chunk()

@Appender(_read_csv_doc)
def read_csv(filepath_or_buffer,
sep=',',
header=0,
index_col=None,
names=None,
skiprows=None,
na_values=None,
parse_dates=False,
date_parser=None,
nrows=None,
iterator=False,
chunksize=None,
skip_footer=0,
converters=None,
verbose=False,
delimiter=None,
encoding=None):
kwds = locals()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not wild on the use of locals() for these, it seems like unnecessary magic. But maybe I'm being overly picky.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the alternative here is to either write out kw=kw for each keyword argument or to have **kwds which makes the signature in IPython less attractive. Not sure what's the best solution-- using locals doesn't strike me as so bad

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having to enumerate all the parameters is error-prone, makes it difficult to extend the other functions, and hides the differences between the calls to _read(). I wish there were a method to get just the args, but there isn't.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed. I think PEP 362 is aimed at this sort of thing - you'd use **kwargs, and construct a more meaningful function signature for introspection - but that's still a work in progress.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally it would be like LISP and a :as variable could be assigned to the set of kwargs. But it's Python. Whatever. We'll eventually end up with LISP again.


# Alias sep -> delimiter.
sep = kwds.pop('sep')
if kwds.get('delimiter', None) is None:
kwds['delimiter'] = sep

return _read(TextParser, filepath_or_buffer, kwds)

@Appender(_read_table_doc)
def read_table(filepath_or_buffer,
               sep='\t',
               header=0,
               index_col=None,
               names=None,
               skiprows=None,
               na_values=None,
               parse_dates=False,
               date_parser=None,
               nrows=None,
               iterator=False,
               chunksize=None,
               skip_footer=0,
               converters=None,
               verbose=False,
               delimiter=None,
               encoding=None):
    # Fix: this span contained diff residue -- the pre-change signature and
    # body were interleaved with the new ones, which is not valid Python.
    # This is the reconstructed new version only.
    #
    # NOTE: locals() must be the first statement so it snapshots exactly the
    # keyword arguments above for pass-through to _read().
    kwds = locals()

    # Alias sep -> delimiter (callers may pass either; 'delimiter' wins).
    sep = kwds.pop('sep')
    if kwds.get('delimiter', None) is None:
        kwds['delimiter'] = sep

    # Override as default encoding.
    kwds['encoding'] = None

    return _read(TextParser, filepath_or_buffer, kwds)

@Appender(_read_fwf_doc)
def read_fwf(filepath_or_buffer,
             colspecs=None,
             widths=None,
             header=0,
             index_col=None,
             names=None,
             skiprows=None,
             na_values=None,
             parse_dates=False,
             date_parser=None,
             nrows=None,
             iterator=False,
             chunksize=None,
             skip_footer=0,
             converters=None,
             delimiter=None,
             verbose=False,
             encoding=None):

    # Snapshot the keyword arguments first; locals() at this point contains
    # exactly the parameters above, which get forwarded to _read().
    kwds = locals()

    # Exactly one of 'colspecs'/'widths' must be supplied.
    colspecs = kwds.get('colspecs', None)
    widths = kwds.pop('widths', None)
    if (colspecs is None) == (widths is None):
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    if widths is not None:
        # Turn contiguous field widths into half-open [start, stop) intervals.
        intervals = []
        start = 0
        for width in widths:
            stop = start + width
            intervals.append((start, stop))
            start = stop
        kwds['colspecs'] = intervals

    return _read(FixedWidthFieldParser, filepath_or_buffer, kwds)



def read_clipboard(**kwargs): # pragma: no cover
"""
Expand Down Expand Up @@ -188,6 +275,7 @@ class TextParser(object):
Parameters
----------
data : file-like object or list
delimiter : separator character to use
names : sequence, default
header : int, default 0
Row to use to parse column labels. Defaults to the first row. Prior
Expand Down Expand Up @@ -573,6 +661,44 @@ def _convert_types(values, na_values):

return result, na_count


class FixedWidthReader(object):
"""
A reader of fixed-width lines.
"""
def __init__(self, f, colspecs, filler):
self.f = f
self.colspecs = colspecs
self.filler = filler # Empty characters between fields.

assert isinstance(colspecs, (tuple, list))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be less strict with this first check - there might be situations where you want to pass a generator rather than a list. I'd do self.colspecs = list(colspecs), and let it handle anything that can be turned into a list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea. WIll do.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

for colspec in colspecs:
assert isinstance(colspec, (tuple, list))
assert len(colspec) == 2
assert isinstance(colspec[0], int)
assert isinstance(colspec[1], int)

def next(self):
    # Fetch the next raw line, then slice it into fields using the
    # half-open [start, stop) column intervals in self.colspecs.
    raw = self.f.next()
    strip_chars = self.filler or ' '
    fields = []
    for start, stop in self.colspecs:
        fields.append(raw[start:stop].strip(strip_chars))
    return fields


class FixedWidthFieldParser(TextParser):
    """
    TextParser specialization that converts fixed-width fields into
    DataFrames.  See TextParser for details.
    """
    def __init__(self, f, **kwds):
        # Accept any iterable of column specs (e.g. a generator) by
        # materializing it into a list up front.
        self.colspecs = list(kwds.pop('colspecs'))

        TextParser.__init__(self, f, **kwds)

    def _make_reader(self, f):
        # 'delimiter' doubles as the filler character between fields.
        self.data = FixedWidthReader(f, self.colspecs, self.delimiter)


#-------------------------------------------------------------------------------
# ExcelFile class

Expand Down Expand Up @@ -793,7 +919,7 @@ def _writerow_xlsx(self, row, sheet_name):
sheet = self.book.create_sheet()
sheet.title = sheet_name
row_idx = 0

conv_row = []
for val in row:
if isinstance(val, np.int64):
Expand Down
47 changes: 46 additions & 1 deletion pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import numpy as np

from pandas import DataFrame, Index, isnull
from pandas.io.parsers import read_csv, read_table, ExcelFile, TextParser
from pandas.io.parsers import read_csv, read_table, read_fwf, ExcelFile, TextParser
from pandas.util.testing import assert_almost_equal, assert_frame_equal
import pandas._tseries as lib
from pandas.util import py3compat
Expand Down Expand Up @@ -719,6 +719,51 @@ def convert_score(x):
na_values=[-1,'',None])
assert_frame_equal(result, result2)

def test_fwf(self):
# Baseline: the same values parsed from an ordinary CSV; every
# fixed-width variant below must produce an identical frame.
data_expected = """\
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
expected = read_csv(StringIO(data_expected), header=None)

# Case 1: explicit column specs as half-open [from, to) intervals.
data1 = """\
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
assert_frame_equal(df, expected)

# Case 2: contiguous field widths instead of explicit colspecs.
data2 = """\
2011 58 360.242940 149.910199 11950.7
2011 59 444.953632 166.985655 11788.4
2011 60 364.136849 183.628767 11806.2
2011 61 413.836124 184.375703 11916.8
2011 62 502.953953 173.237159 12468.3
"""
df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
assert_frame_equal(df, expected)

# From Thomas Kluyver: apparently some non-space filler characters can
# be seen, this is supported by specifying the 'delimiter' character:
# http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
data3 = """\
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
assert_frame_equal(df, expected)


class TestParseSQL(unittest.TestCase):

def test_convert_sql_column_floats(self):
Expand Down