Skip to content

Added support for read_fwf() as in R. #952

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 2, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 165 additions & 39 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,50 +74,54 @@
%s
""" % (_parser_params % _csv_sep)

# Docstring for read_csv: the shared parser-parameter template filled in
# with the CSV-specific separator description.
_read_csv_doc = """
Read CSV (comma-separated) file into DataFrame

%s
""" % (_parser_params % _csv_sep)

# Docstring for read_table: same template, filled in with the general
# (tab/whitespace) separator description.
_read_table_doc = """
Read general delimited file into DataFrame

%s
""" % (_parser_params % _table_sep)

@Appender(_read_csv_doc)
def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None,
skiprows=None, na_values=None, parse_dates=False,
date_parser=None, nrows=None, iterator=False, chunksize=None,
skip_footer=0, converters=None, verbose=False, delimiter=None,
encoding=None):

_fwf_widths = """\
colspecs : a list of pairs (tuples), giving the extents
of the fixed-width fields of each line as half-open internals
(i.e., [from, to[ ).
widths : a list of field widths, which can be used instead of
'colspecs' if the intervals are contiguous.
"""

# Full docstring for read_fwf: the shared parser-parameter template extended
# with the fixed-width-specific 'colspecs'/'widths' entries plus a note that
# 'delimiter' doubles as the filler character between fields.
_read_fwf_doc = """
Read a table of fixed-width formatted lines into DataFrame

%s

Also, 'delimiter' is used to specify the filler character of the
fields if it is not spaces (e.g., '~').
""" % (_parser_params % _fwf_widths)


def _read(cls, filepath_or_buffer, kwds):
"Generic reader of line files."
if hasattr(filepath_or_buffer, 'read'):
f = filepath_or_buffer
else:
encoding = kwds.get('encoding', None)
try:
# universal newline mode
f = com._get_handle(filepath_or_buffer, 'U', encoding=encoding)
except Exception: # pragma: no cover
f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)

if delimiter is not None:
sep = delimiter
if kwds.get('date_parser', None) is not None:
kwds['parse_dates'] = True

if date_parser is not None:
parse_dates = True
# Extract some of the arguments (pass chunksize on).
kwds.pop('filepath_or_buffer')
iterator = kwds.pop('iterator')
nrows = kwds.pop('nrows')
chunksize = kwds.get('chunksize', None)

parser = TextParser(f, header=header, index_col=index_col,
names=names, na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
skiprows=skiprows,
delimiter=sep,
chunksize=chunksize,
skip_footer=skip_footer,
converters=converters,
verbose=verbose,
encoding=encoding)
# Create the parser.
parser = cls(f, **kwds)

if nrows is not None:
return parser.get_chunk(nrows)
Expand All @@ -126,19 +130,102 @@ def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None,

return parser.get_chunk()

@Appender(_read_csv_doc)
def read_csv(filepath_or_buffer,
sep=',',
header=0,
index_col=None,
names=None,
skiprows=None,
na_values=None,
parse_dates=False,
date_parser=None,
nrows=None,
iterator=False,
chunksize=None,
skip_footer=0,
converters=None,
verbose=False,
delimiter=None,
encoding=None):
kwds = locals()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not wild on the use of locals() for these, it seems like unnecessary magic. But maybe I'm being overly picky.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the alternative here is to either write out kw=kw for each keyword argument or to have **kwds which makes the signature in IPython less attractive. Not sure what's the best solution-- using locals doesn't strike me as so bad

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having to enumerate all the parameters is error-prone, makes it difficult to extend the other functions, and hides the differences between the calls to _read(). I wish there were a method to get just the args, but there isn't.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed. I think PEP 362 is aimed at this sort of thing - you'd use **kwargs, and construct a more meaningful function signature for introspection - but that's still a work in progress.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally it would be like LISP and a :as variable could be assigned to the set of kwargs. But it's Python. Whatever. We'll eventually end up with LISP again.


# Alias sep -> delimiter.
sep = kwds.pop('sep')
if kwds.get('delimiter', None) is None:
kwds['delimiter'] = sep

return _read(TextParser, filepath_or_buffer, kwds)

@Appender(_read_table_doc)
def read_table(filepath_or_buffer,
               sep='\t',
               header=0,
               index_col=None,
               names=None,
               skiprows=None,
               na_values=None,
               parse_dates=False,
               date_parser=None,
               nrows=None,
               iterator=False,
               chunksize=None,
               skip_footer=0,
               converters=None,
               verbose=False,
               delimiter=None,
               encoding=None):
    # Fix: this span contained diff residue -- the pre-change signature and
    # body were interleaved with the new ones, which is not valid Python.
    # This is the reconstructed new version only.
    #
    # NOTE: locals() must be the first statement so it snapshots exactly the
    # keyword arguments above for pass-through to _read().
    kwds = locals()

    # Alias sep -> delimiter (callers may pass either; 'delimiter' wins).
    sep = kwds.pop('sep')
    if kwds.get('delimiter', None) is None:
        kwds['delimiter'] = sep

    # Override as default encoding.
    kwds['encoding'] = None

    return _read(TextParser, filepath_or_buffer, kwds)

@Appender(_read_fwf_doc)
def read_fwf(filepath_or_buffer,
             colspecs=None,
             widths=None,
             header=0,
             index_col=None,
             names=None,
             skiprows=None,
             na_values=None,
             parse_dates=False,
             date_parser=None,
             nrows=None,
             iterator=False,
             chunksize=None,
             skip_footer=0,
             converters=None,
             delimiter=None,
             verbose=False,
             encoding=None):

    # Snapshot the keyword arguments first; locals() at this point contains
    # exactly the parameters above, which get forwarded to _read().
    kwds = locals()

    # Exactly one of 'colspecs'/'widths' must be supplied.
    colspecs = kwds.get('colspecs', None)
    widths = kwds.pop('widths', None)
    if (colspecs is None) == (widths is None):
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    if widths is not None:
        # Turn contiguous field widths into half-open [start, stop) intervals.
        intervals = []
        start = 0
        for width in widths:
            stop = start + width
            intervals.append((start, stop))
            start = stop
        kwds['colspecs'] = intervals

    return _read(FixedWidthFieldParser, filepath_or_buffer, kwds)



def read_clipboard(**kwargs): # pragma: no cover
"""
Expand Down Expand Up @@ -188,6 +275,7 @@ class TextParser(object):
Parameters
----------
data : file-like object or list
delimiter : separator character to use
names : sequence, default
header : int, default 0
Row to use to parse column labels. Defaults to the first row. Prior
Expand Down Expand Up @@ -573,6 +661,44 @@ def _convert_types(values, na_values):

return result, na_count


class FixedWidthReader(object):
"""
A reader of fixed-width lines.
"""
def __init__(self, f, colspecs, filler):
self.f = f
self.colspecs = colspecs
self.filler = filler # Empty characters between fields.

assert isinstance(colspecs, (tuple, list))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be less strict with this first check - there might be situations where you want to pass a generator rather than a list. I'd do self.colspecs = list(colspecs), and let it handle anything that can be turned into a list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea. WIll do.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

for colspec in colspecs:
assert isinstance(colspec, (tuple, list))
assert len(colspec) == 2
assert isinstance(colspec[0], int)
assert isinstance(colspec[1], int)

def next(self):
    # Fetch the next raw line, then slice it into fields using the
    # half-open [start, stop) column intervals in self.colspecs.
    raw = self.f.next()
    strip_chars = self.filler or ' '
    fields = []
    for start, stop in self.colspecs:
        fields.append(raw[start:stop].strip(strip_chars))
    return fields


class FixedWidthFieldParser(TextParser):
    """
    TextParser specialization that converts fixed-width fields into
    DataFrames.  See TextParser for details.
    """
    def __init__(self, f, **kwds):
        # Accept any iterable of column specs (e.g. a generator) by
        # materializing it into a list up front.
        self.colspecs = list(kwds.pop('colspecs'))

        TextParser.__init__(self, f, **kwds)

    def _make_reader(self, f):
        # 'delimiter' doubles as the filler character between fields.
        self.data = FixedWidthReader(f, self.colspecs, self.delimiter)


#-------------------------------------------------------------------------------
# ExcelFile class

Expand Down Expand Up @@ -793,7 +919,7 @@ def _writerow_xlsx(self, row, sheet_name):
sheet = self.book.create_sheet()
sheet.title = sheet_name
row_idx = 0

conv_row = []
for val in row:
if isinstance(val, np.int64):
Expand Down
47 changes: 46 additions & 1 deletion pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import numpy as np

from pandas import DataFrame, Index, isnull
from pandas.io.parsers import read_csv, read_table, ExcelFile, TextParser
from pandas.io.parsers import read_csv, read_table, read_fwf, ExcelFile, TextParser
from pandas.util.testing import assert_almost_equal, assert_frame_equal
import pandas._tseries as lib
from pandas.util import py3compat
Expand Down Expand Up @@ -719,6 +719,51 @@ def convert_score(x):
na_values=[-1,'',None])
assert_frame_equal(result, result2)

def test_fwf(self):
# Baseline: the same values parsed from an ordinary CSV; every
# fixed-width variant below must produce an identical frame.
data_expected = """\
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
expected = read_csv(StringIO(data_expected), header=None)

# Case 1: explicit column specs as half-open [from, to) intervals.
data1 = """\
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
assert_frame_equal(df, expected)

# Case 2: contiguous field widths instead of explicit colspecs.
data2 = """\
2011 58 360.242940 149.910199 11950.7
2011 59 444.953632 166.985655 11788.4
2011 60 364.136849 183.628767 11806.2
2011 61 413.836124 184.375703 11916.8
2011 62 502.953953 173.237159 12468.3
"""
df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
assert_frame_equal(df, expected)

# From Thomas Kluyver: apparently some non-space filler characters can
# be seen, this is supported by specifying the 'delimiter' character:
# http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
data3 = """\
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
assert_frame_equal(df, expected)


class TestParseSQL(unittest.TestCase):

def test_convert_sql_column_floats(self):
Expand Down