Skip to content

Commit

Permalink
BUG: Fix issue with inserting duplicate columns in a dataframe
Browse files Browse the repository at this point in the history
closes #14291
closes #14431
  • Loading branch information
paul-mannino authored and jreback committed Oct 24, 2016
1 parent 5cf6d94 commit 2e77536
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 8 deletions.
8 changes: 8 additions & 0 deletions doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Bug Fixes


- Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`)
- Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`)

- ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`)

Expand All @@ -63,4 +64,11 @@ Bug Fixes
- Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)








- Bug in ``pd.pivot_table`` where a ``TypeError`` or ``ValueError`` may be raised when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`)
30 changes: 23 additions & 7 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2487,7 +2487,7 @@ def _set_item(self, key, value):

# check if we are modifying a copy
# try to set first as we want an invalid
# value exeption to occur first
# value exception to occur first
if len(self):
self._check_setitem_copy()

Expand All @@ -2503,10 +2503,10 @@ def insert(self, loc, column, value, allow_duplicates=False):
loc : int
Must have 0 <= loc <= len(columns)
column : object
value : int, Series, or array-like
value : scalar, Series, or array-like
"""
self._ensure_valid_index(value)
value = self._sanitize_column(column, value)
value = self._sanitize_column(column, value, broadcast=False)
self._data.insert(loc, column, value,
allow_duplicates=allow_duplicates)

Expand Down Expand Up @@ -2590,9 +2590,25 @@ def assign(self, **kwargs):

return data

def _sanitize_column(self, key, value):
# Need to make sure new columns (which go into the BlockManager as new
# blocks) are always copied
def _sanitize_column(self, key, value, broadcast=True):
"""
Ensures new columns (which go into the BlockManager as new blocks) are
always copied and converted into an array.
Parameters
----------
key : object
value : scalar, Series, or array-like
broadcast : bool, default True
If ``key`` matches multiple duplicate column names in the
DataFrame, this parameter indicates whether ``value`` should be
tiled so that the returned array contains a (duplicated) column for
each occurrence of the key. If False, ``value`` will not be tiled.
Returns
-------
sanitized_column : numpy-array
"""

def reindexer(value):
# reindex if necessary
Expand Down Expand Up @@ -2665,7 +2681,7 @@ def reindexer(value):
return value

# broadcast across multiple columns if necessary
if key in self.columns and value.ndim == 1:
if broadcast and key in self.columns and value.ndim == 1:
if (not self.columns.is_unique or
isinstance(self.columns, MultiIndex)):
existing_piece = self[key]
Expand Down
16 changes: 15 additions & 1 deletion pandas/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,21 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
# ----------------------------------------------------------------------
# Support different internal representation of SparseDataFrame

def _sanitize_column(self, key, value):
def _sanitize_column(self, key, value, **kwargs):
"""
Creates a new SparseArray from the input value.
Parameters
----------
key : object
value : scalar, Series, or array-like
kwargs : dict
Returns
-------
sanitized_column : SparseArray
"""
sp_maker = lambda x, index=None: SparseArray(
x, index=index, fill_value=self._default_fill_value,
kind=self._default_kind)
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/frame/test_nonunique_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,3 +468,13 @@ def test_set_value_by_index(self):

df.iloc[:, 0] = 3
assert_series_equal(df.iloc[:, 1], expected)

def test_insert_with_columns_dups(self):
# GH 14291
# Regression test: several consecutive ``DataFrame.insert`` calls that
# reuse the same column label must each add exactly one new column
# when ``allow_duplicates=True``.
df = pd.DataFrame()
# Every insert targets position 0, so the column inserted last ends up
# leftmost in the final frame.
df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
# Expected frame, written row by row: three columns, all labelled 'A',
# ordered last-inserted first.
exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
['c', 'f', 'i']], columns=['A', 'A', 'A'])
assert_frame_equal(df, exp)

0 comments on commit 2e77536

Please sign in to comment.