diff --git a/RELEASE.rst b/RELEASE.rst index 6bc55b030be19..2d672b980cefc 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -10,11 +10,32 @@ pandas 0.4.4 **Release date:** not yet released +**New features / modules** + + - Added `parse_dates` option to `read_csv` and `read_table` methods to + optionally try to parse dates in the index columns + - Added ability to join on multiple columns in `DataFrame.join` (GH #214) + +**API Changes** + + - `read_table`, `read_csv`, and `ExcelFile.parse` default argument for + `index_col` is now None. To use one or more of the columns as the resulting + DataFrame's index, these must be explicitly specified now + - Parsing functions no longer parse dates by default (GH #225) + **Improvements to existing features** - Refactored merging / joining code into a tidy class and disabled unnecessary computations in the float/object case, thus getting about 10% better performance + - Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about + 5x, regression from 0.3.0 + +**Bug fixes** + + - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. 
Should + be reported upstream to matplotlib (GH #224) + pandas 0.4.3 ============ diff --git a/pandas/core/common.py b/pandas/core/common.py index fd2863735d0bf..cadda5d432d39 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -518,6 +518,9 @@ def _asarray_tuplesafe(values, dtype=None): if not isinstance(values, (list, tuple, np.ndarray)): values = list(values) + if isinstance(values, list) and dtype == np.object_: + return lib.list_to_object_array(values) + result = np.asarray(values, dtype=dtype) if issubclass(result.dtype.type, basestring): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 125120d2ecb41..29e2cc6e560f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -430,7 +430,8 @@ def to_records(self, index=True): return np.rec.fromarrays(arrays, names=names) @classmethod - def from_csv(cls, path, header=0, delimiter=',', index_col=0): + def from_csv(cls, path, header=0, delimiter=',', index_col=0, + parse_dates=True): """ Read delimited file into DataFrame @@ -447,16 +448,15 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0): Notes ----- Will attempt to convert index to datetimes for time series - data. Use read_csv for more options + data. 
Use read_table for more options Returns ------- y : DataFrame or DataFrame """ from pandas.io.parsers import read_table - df = read_table(path, header=header, sep=delimiter, - index_col=index_col) - return df + return read_table(path, header=header, sep=delimiter, + parse_dates=parse_dates, index_col=index_col) def to_sparse(self, fill_value=None, kind='block'): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1ec552555036f..e3a30a8621c86 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -9,36 +9,10 @@ from pandas.core.index import Index, MultiIndex from pandas.core.frame import DataFrame -def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, - na_values=None, date_parser=None, names=None): - """ - Read CSV file into DataFrame - Parameters - ---------- - filepath_or_buffer : string or file handle / StringIO - sep : string, default None - Delimiter to use. By default will try to automatically determine - this - header : int, default 0 - Row to use for the column labels of the parsed DataFrame - skiprows : list-like - Row numbers to skip (0-indexed) - index_col : int or sequence., default 0 - Column to use as the row labels of the DataFrame. Pass None if there is - no such column. If a sequence is given, a MultiIndex is used. - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - date_parser : function - Function to use for converting dates to strings. 
Defaults to - dateutil.parser - names : array-like - List of column names - - Returns - ------- - parsed : DataFrame - """ +def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, + skiprows=None, na_values=None, parse_dates=False, + date_parser=None): import csv if hasattr(filepath_or_buffer, 'read'): @@ -71,43 +45,77 @@ def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, else: lines = [l for l in reader] f.close() - return _simple_parser(lines, header=header, indexCol=index_col, - colNames=names, na_values=na_values, - date_parser=date_parser) -def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, - index_col=0, na_values=None, date_parser=None, names=None): - """ - Read delimited file into DataFrame + if date_parser is not None: + parse_dates = True - Parameters - ---------- - filepath_or_buffer : string or file handle - sep : string, default '\t' - Delimiter to use - header : int, default 0 - Row to use for the column labels of the parsed DataFrame - skiprows : list-like - Row numbers to skip (0-indexed) - index_col : int or sequence, default 0 - Column to use as the row labels of the DataFrame. Pass None if there is - no such column. If a sequence is given, a MultiIndex is used. - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - date_parser : function - Function to use for converting dates to strings. 
Defaults to - dateutil.parser - names : array-like - List of column names - - Returns - ------- - parsed : DataFrame - """ - return read_csv(filepath_or_buffer, sep, header, skiprows, - index_col, na_values, date_parser, names) + return _simple_parser(lines, + header=header, + index_col=index_col, + colNames=names, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser) -def _simple_parser(lines, colNames=None, header=0, indexCol=0, + +def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, + names=None, skiprows=None, na_values=None, parse_dates=False, + date_parser=None): + return read_csv(filepath_or_buffer, sep=sep, header=header, + skiprows=skiprows, index_col=index_col, + na_values=na_values, date_parser=date_parser, + names=names, parse_dates=parse_dates) + +_parser_params = """Parameters +---------- +filepath_or_buffer : string or file handle / StringIO +%s +header : int, default 0 + Row to use for the column labels of the parsed DataFrame +skiprows : list-like + Row numbers to skip (0-indexed) +index_col : int or sequence, default None + Column to use as the row labels of the DataFrame. If a sequence is + given, a MultiIndex is used. +na_values : list-like, default None + List of additional strings to recognize as NA/NaN +parse_dates : boolean, default False + Attempt to parse dates in the index column(s) +date_parser : function + Function to use for converting dates to strings. Defaults to + dateutil.parser +names : array-like + List of column names""" + +_csv_sep = """sep : string, default None + Delimiter to use. 
By default will try to automatically determine + this""" + +_table_sep = """sep : string, default \\t (tab-stop) + Delimiter to use""" + +read_csv.__doc__ = """ +Read CSV (comma-separated) file into DataFrame + +%s + +Returns +------- +parsed : DataFrame +""" % (_parser_params % _csv_sep) + +read_table.__doc__ = """ +Read delimited file into DataFrame + +%s + +Returns +------- +parsed : DataFrame +""" % (_parser_params % _table_sep) + + +def _simple_parser(lines, colNames=None, header=0, index_col=0, na_values=None, date_parser=None, parse_dates=True): """ Workhorse function for processing nested list into DataFrame @@ -142,30 +150,48 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0, zipped_content = zip(*content) if len(content) == 0: # pragma: no cover - raise Exception('No content to parse') + if index_col is not None: + if np.isscalar(index_col): + index = Index([], name=columns.pop(index_col)) + else: + cp_cols = list(columns) + names = [] + for i in index_col: + name = cp_cols[i] + columns.remove(name) + names.append(name) + index = MultiIndex.fromarrays([[]] * len(index_col), + names=names) + else: + index = Index([]) + + return DataFrame(index=index, columns=columns) + + if index_col is None and len(content[0]) == len(columns) + 1: + index_col = 0 # no index column specified, so infer that's what is wanted - if indexCol is not None: - if np.isscalar(indexCol): - if indexCol == 0 and len(content[0]) == len(columns) + 1: + if index_col is not None: + if np.isscalar(index_col): + if index_col == 0 and len(content[0]) == len(columns) + 1: index = zipped_content[0] zipped_content = zipped_content[1:] else: - index = zipped_content.pop(indexCol) - columns.pop(indexCol) + index = zipped_content.pop(index_col) + columns.pop(index_col) else: # given a list of index idx_names = [] index = [] - for idx in indexCol: + for idx in index_col: idx_names.append(columns[idx]) index.append(zipped_content[idx]) #remove index items from content and columns, 
don't pop in loop - for i in range(len(indexCol)): + for i in range(len(index_col)): columns.remove(idx_names[i]) zipped_content.remove(index[i]) - if np.isscalar(indexCol): + if np.isscalar(index_col): if parse_dates: index = _try_parse_dates(index, parser=date_parser) index = Index(_maybe_convert_int(np.array(index, dtype=object))) @@ -232,9 +258,6 @@ def _maybe_convert_int(arr): return arr def _maybe_convert_int_mindex(index, parse_dates, date_parser): - if len(index) == 0: - return index - for i in range(len(index)): try: int(index[i][0]) @@ -298,8 +321,8 @@ def __init__(self, path): def __repr__(self): return object.__repr__(self) - def parse(self, sheetname, header=0, skiprows=None, index_col=0, - na_values=None): + def parse(self, sheetname, header=0, skiprows=None, index_col=None, + parse_dates=False, date_parser=None, na_values=None): """ Read Excel table into DataFrame @@ -348,7 +371,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=0, value = datetime(*dt) row.append(value) data.append(row) - return _simple_parser(data, header=header, indexCol=index_col, + return _simple_parser(data, header=header, index_col=index_col, + parse_dates=parse_dates, date_parser=date_parser, na_values=na_values) #------------------------------------------------------------------------------- @@ -363,7 +387,8 @@ def parseCSV(filepath, header=0, skiprows=None, indexCol=0, """ warnings.warn("parseCSV is deprecated. Use read_csv instead", FutureWarning) return read_csv(filepath, header=header, skiprows=skiprows, - index_col=indexCol, na_values=na_values) + index_col=indexCol, na_values=na_values, + parse_dates=True) def parseText(filepath, sep='\t', header=0, indexCol=0, colNames=None): # pragma: no cover @@ -374,7 +399,7 @@ def parseText(filepath, sep='\t', header=0, warnings.warn("parseText is deprecated. 
Use read_table instead", FutureWarning) return read_table(filepath, sep=sep, header=header, index_col=indexCol, - names=colNames) + names=colNames, parse_dates=True) def parseExcel(filepath, header=None, indexCol=0, @@ -385,6 +410,7 @@ def parseExcel(filepath, header=None, indexCol=0, warnings.warn("parseExcel is deprecated. Use the ExcelFile class instead", FutureWarning) excel_file = ExcelFile(filepath) - return excel_file.parse(sheetname, header=header, index_col=indexCol) + return excel_file.parse(sheetname, header=header, index_col=indexCol, + parse_dates=True) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index e6cfce6f32cb3..f4049bf3adcbe 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -34,12 +34,11 @@ def test_custom_na_values(self): [nan, 5, nan], [7, 8, nan]] - df = read_csv(StringIO(data), index_col=None, na_values=['baz'], - skiprows=[1]) + df = read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) assert_almost_equal(df.values, expected) - df2 = read_table(StringIO(data), sep=',', index_col=None, - na_values=['baz'], skiprows=[1]) + df2 = read_table(StringIO(data), sep=',', na_values=['baz'], + skiprows=[1]) assert_almost_equal(df2.values, expected) def test_unnamed_columns(self): @@ -51,7 +50,7 @@ def test_unnamed_columns(self): expected = [[1,2,3,4,5.], [6,7,8,9,10], [11,12,13,14,15]] - df = read_table(StringIO(data), sep=',', index_col=None) + df = read_table(StringIO(data), sep=',') assert_almost_equal(df.values, expected) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'Unnamed: 3', @@ -84,7 +83,7 @@ def test_csv_custom_parser(self): """ df = read_csv(StringIO(data), date_parser=lambda x: datetime.strptime(x, '%Y%m%d')) - expected = read_csv(StringIO(data)) + expected = read_csv(StringIO(data), parse_dates=True) assert_frame_equal(df, expected) def test_no_header(self): @@ -92,11 +91,9 @@ def test_no_header(self): 6,7,8,9,10 11,12,13,14,15 """ - df = 
read_table(StringIO(data), sep=',', index_col=None, - header=None) + df = read_table(StringIO(data), sep=',', header=None) names = ['foo', 'bar', 'baz', 'quux', 'panda'] - df2 = read_table(StringIO(data), sep=',', index_col=None, - header=None, names=names) + df2 = read_table(StringIO(data), sep=',', header=None, names=names) expected = [[1,2,3,4,5.], [6,7,8,9,10], [11,12,13,14,15]] @@ -106,16 +103,16 @@ def test_no_header(self): self.assert_(np.array_equal(df2.columns, names)) def test_read_csv_dataframe(self): - df = read_csv(self.csv1) - df2 = read_table(self.csv1, sep=',') + df = read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D'])) self.assert_(isinstance(df.index[0], datetime)) self.assert_(df.values.dtype == np.float64) assert_frame_equal(df, df2) def test_read_csv_no_index_name(self): - df = read_csv(self.csv2) - df2 = read_table(self.csv2, sep=',') + df = read_csv(self.csv2, index_col=0, parse_dates=True) + df2 = read_table(self.csv2, sep=',', index_col=0, parse_dates=True) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E'])) self.assert_(isinstance(df.index[0], datetime)) self.assert_(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64) @@ -129,12 +126,22 @@ def test_excel_table(self): pth = os.path.join(self.dirpath, 'test.xls') xls = ExcelFile(pth) - df = xls.parse('Sheet1') - df2 = read_csv(self.csv1) - df3 = xls.parse('Sheet2', skiprows=[1]) + df = xls.parse('Sheet1', index_col=0, parse_dates=True) + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) assert_frame_equal(df, df2) assert_frame_equal(df3, df2) + def test_read_table_wrong_num_columns(self): + data = """A,B,C,D,E,F +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + self.assertRaises(Exception, read_csv, StringIO(data)) + + + def curpath(): pth, _ = 
os.path.split(os.path.abspath(__file__)) return pth diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index ed65a9b39e660..50a8a8cf1e752 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -257,6 +257,23 @@ def isnullobj(ndarray input): return result +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr + include "skiplist.pyx" include "groupby.pyx" include "moments.pyx" diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cf64e5a5d377f..79fa30e22885c 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1589,6 +1589,11 @@ def test_to_csv_multiindex(self): os.remove(path) + # empty + tsframe[:0].to_csv(path) + recons = DataFrame.from_csv(path) + assert_frame_equal(recons, tsframe[:0]) + def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan