-
-
Notifications
You must be signed in to change notification settings - Fork 18.8k
Added support for read_fwf() as in R. #952
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
47e6b5c
42309d1
d044688
b2e08b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,50 +74,54 @@ | |
%s | ||
""" % (_parser_params % _csv_sep) | ||
|
||
_read_csv_doc = """ | ||
Read CSV (comma-separated) file into DataFrame | ||
|
||
%s | ||
""" % (_parser_params % _csv_sep) | ||
|
||
_read_table_doc = """ | ||
Read general delimited file into DataFrame | ||
|
||
%s | ||
""" % (_parser_params % _table_sep) | ||
|
||
@Appender(_read_csv_doc) | ||
def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None, | ||
skiprows=None, na_values=None, parse_dates=False, | ||
date_parser=None, nrows=None, iterator=False, chunksize=None, | ||
skip_footer=0, converters=None, verbose=False, delimiter=None, | ||
encoding=None): | ||
|
||
_fwf_widths = """\ | ||
colspecs : a list of pairs (tuples), giving the extents | ||
of the fixed-width fields of each line as half-open internals | ||
(i.e., [from, to[ ). | ||
widths : a list of field widths, which can be used instead of | ||
'colspecs' if the intervals are contiguous. | ||
""" | ||
|
||
_read_fwf_doc = """ | ||
Read a table of fixed-width formatted lines into DataFrame | ||
|
||
%s | ||
|
||
Also, 'delimiter' is used to specify the filler character of the | ||
fields if it is not spaces (e.g., '~'). | ||
""" % (_parser_params % _fwf_widths) | ||
|
||
|
||
def _read(cls, filepath_or_buffer, kwds): | ||
"Generic reader of line files." | ||
if hasattr(filepath_or_buffer, 'read'): | ||
f = filepath_or_buffer | ||
else: | ||
encoding = kwds.get('encoding', None) | ||
try: | ||
# universal newline mode | ||
f = com._get_handle(filepath_or_buffer, 'U', encoding=encoding) | ||
except Exception: # pragma: no cover | ||
f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding) | ||
|
||
if delimiter is not None: | ||
sep = delimiter | ||
if kwds.get('date_parser', None) is not None: | ||
kwds['parse_dates'] = True | ||
|
||
if date_parser is not None: | ||
parse_dates = True | ||
# Extract some of the arguments (pass chunksize on). | ||
kwds.pop('filepath_or_buffer') | ||
iterator = kwds.pop('iterator') | ||
nrows = kwds.pop('nrows') | ||
chunksize = kwds.get('chunksize', None) | ||
|
||
parser = TextParser(f, header=header, index_col=index_col, | ||
names=names, na_values=na_values, | ||
parse_dates=parse_dates, | ||
date_parser=date_parser, | ||
skiprows=skiprows, | ||
delimiter=sep, | ||
chunksize=chunksize, | ||
skip_footer=skip_footer, | ||
converters=converters, | ||
verbose=verbose, | ||
encoding=encoding) | ||
# Create the parser. | ||
parser = cls(f, **kwds) | ||
|
||
if nrows is not None: | ||
return parser.get_chunk(nrows) | ||
|
@@ -126,19 +130,102 @@ def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None, | |
|
||
return parser.get_chunk() | ||
|
||
@Appender(_read_csv_doc) | ||
def read_csv(filepath_or_buffer, | ||
sep=',', | ||
header=0, | ||
index_col=None, | ||
names=None, | ||
skiprows=None, | ||
na_values=None, | ||
parse_dates=False, | ||
date_parser=None, | ||
nrows=None, | ||
iterator=False, | ||
chunksize=None, | ||
skip_footer=0, | ||
converters=None, | ||
verbose=False, | ||
delimiter=None, | ||
encoding=None): | ||
kwds = locals() | ||
|
||
# Alias sep -> delimiter. | ||
sep = kwds.pop('sep') | ||
if kwds.get('delimiter', None) is None: | ||
kwds['delimiter'] = sep | ||
|
||
return _read(TextParser, filepath_or_buffer, kwds) | ||
|
||
@Appender(_read_table_doc) | ||
def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, | ||
names=None, skiprows=None, na_values=None, parse_dates=False, | ||
date_parser=None, nrows=None, iterator=False, chunksize=None, | ||
skip_footer=0, converters=None, verbose=False, delimiter=None, | ||
def read_table(filepath_or_buffer, | ||
sep='\t', | ||
header=0, | ||
index_col=None, | ||
names=None, | ||
skiprows=None, | ||
na_values=None, | ||
parse_dates=False, | ||
date_parser=None, | ||
nrows=None, | ||
iterator=False, | ||
chunksize=None, | ||
skip_footer=0, | ||
converters=None, | ||
verbose=False, | ||
delimiter=None, | ||
encoding=None): | ||
return read_csv(filepath_or_buffer, sep=sep, header=header, | ||
skiprows=skiprows, index_col=index_col, | ||
na_values=na_values, date_parser=date_parser, | ||
names=names, parse_dates=parse_dates, | ||
nrows=nrows, iterator=iterator, chunksize=chunksize, | ||
skip_footer=skip_footer, converters=converters, | ||
verbose=verbose, delimiter=delimiter, encoding=None) | ||
kwds = locals() | ||
|
||
# Alias sep -> delimiter. | ||
sep = kwds.pop('sep') | ||
if kwds.get('delimiter', None) is None: | ||
kwds['delimiter'] = sep | ||
|
||
# Override as default encoding. | ||
kwds['encoding'] = None | ||
|
||
return _read(TextParser, filepath_or_buffer, kwds) | ||
|
||
@Appender(_read_fwf_doc) | ||
def read_fwf(filepath_or_buffer, | ||
colspecs=None, | ||
widths=None, | ||
header=0, | ||
index_col=None, | ||
names=None, | ||
skiprows=None, | ||
na_values=None, | ||
parse_dates=False, | ||
date_parser=None, | ||
nrows=None, | ||
iterator=False, | ||
chunksize=None, | ||
skip_footer=0, | ||
converters=None, | ||
delimiter=None, | ||
verbose=False, | ||
encoding=None): | ||
|
||
kwds = locals() | ||
|
||
# Check input arguments. | ||
colspecs = kwds.get('colspecs', None) | ||
widths = kwds.pop('widths', None) | ||
if bool(colspecs is None) == bool(widths is None): | ||
raise ValueError("You must specify only one of 'widths' and 'colspecs'") | ||
|
||
# Compute 'colspec' from 'widths', if specified. | ||
if widths is not None: | ||
colspecs, col = [], 0 | ||
for w in widths: | ||
colspecs.append( (col, col+w) ) | ||
col += w | ||
kwds['colspecs'] = colspecs | ||
|
||
return _read(FixedWidthFieldParser, filepath_or_buffer, kwds) | ||
|
||
|
||
|
||
def read_clipboard(**kwargs): # pragma: no cover | ||
""" | ||
|
@@ -188,6 +275,7 @@ class TextParser(object): | |
Parameters | ||
---------- | ||
data : file-like object or list | ||
delimiter : separator character to use | ||
names : sequence, default | ||
header : int, default 0 | ||
Row to use to parse column labels. Defaults to the first row. Prior | ||
|
@@ -573,6 +661,44 @@ def _convert_types(values, na_values): | |
|
||
return result, na_count | ||
|
||
|
||
class FixedWidthReader(object): | ||
""" | ||
A reader of fixed-width lines. | ||
""" | ||
def __init__(self, f, colspecs, filler): | ||
self.f = f | ||
self.colspecs = colspecs | ||
self.filler = filler # Empty characters between fields. | ||
|
||
assert isinstance(colspecs, (tuple, list)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd be less strict with this first check - there might be situations where you want to pass a generator rather than a list. I'd do There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good idea. Will do. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||
for colspec in colspecs: | ||
assert isinstance(colspec, (tuple, list)) | ||
assert len(colspec) == 2 | ||
assert isinstance(colspec[0], int) | ||
assert isinstance(colspec[1], int) | ||
|
||
def next(self): | ||
line = self.f.next() | ||
# Note: 'colspecs' is a sequence of half-open intervals. | ||
return [line[fromm:to].strip(self.filler or ' ') for (fromm, to) in self.colspecs] | ||
|
||
|
||
class FixedWidthFieldParser(TextParser): | ||
""" | ||
Specialization that Converts fixed-width fields into DataFrames. | ||
See TextParser for details. | ||
""" | ||
def __init__(self, f, **kwds): | ||
# Support iterators, convert to a list. | ||
self.colspecs = list(kwds.pop('colspecs')) | ||
|
||
TextParser.__init__(self, f, **kwds) | ||
|
||
def _make_reader(self, f): | ||
self.data = FixedWidthReader(f, self.colspecs, self.delimiter) | ||
|
||
|
||
#------------------------------------------------------------------------------- | ||
# ExcelFile class | ||
|
||
|
@@ -793,7 +919,7 @@ def _writerow_xlsx(self, row, sheet_name): | |
sheet = self.book.create_sheet() | ||
sheet.title = sheet_name | ||
row_idx = 0 | ||
|
||
conv_row = [] | ||
for val in row: | ||
if isinstance(val, np.int64): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not wild about the use of `locals()` for these; it seems like unnecessary magic. But maybe I'm being overly picky.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, the alternative here is to either write out `kw=kw` for each keyword argument or to have `**kwds`, which makes the signature in IPython less attractive. Not sure what the best solution is -- using locals doesn't strike me as so bad.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Having to enumerate all the parameters is error-prone, makes it difficult to extend the other functions, and hides the differences between the calls to _read(). I wish there were a way to get just the args, but there isn't.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed. I think PEP 362 is aimed at this sort of thing - you'd use `**kwargs`, and construct a more meaningful function signature for introspection - but that's still a work in progress.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ideally it would be like LISP and a :as variable could be assigned to the set of kwargs. But it's Python. Whatever. We'll eventually end up with LISP again.