Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

Issues #194 (cov function) and #92 (multi-column sort) (pull request #362)

Closed
wants to merge 6 commits into
from
View
@@ -1491,11 +1491,7 @@ def sort(self, column=None, axis=0, ascending=True):
-------
sorted : DataFrame
"""
- by = None
- if column:
- assert(axis == 0)
- by = self[column].values
- return self.sort_index(by=by, axis=axis, ascending=ascending)
+ return self.sort_index(by=column, axis=axis, ascending=ascending)
def sort_index(self, axis=0, by=None, ascending=True):
"""
@@ -1507,7 +1503,7 @@ def sort_index(self, axis=0, by=None, ascending=True):
axis : {0, 1}
Sort index/rows versus columns
by : object
- Column name in frame
+ Column names in frame
ascending : boolean, default True
Sort ascending vs. descending
@@ -1516,17 +1512,20 @@ def sort_index(self, axis=0, by=None, ascending=True):
sorted : DataFrame
"""
labels = self._get_axis(axis)
-
+ order_list = None
if by is not None:
- try:
- if by in self.columns:
- assert(axis == 0)
- by = self[by].values
- except Exception:
- pass
-
- assert(len(by) == len(labels))
- sort_index = Series(by, index=labels).order().index
+ assert(axis == 0)
+ by = self[by]
+
+ if isinstance(by, Series):
+ assert(len(by) == len(labels))
+ by = by.values
+ sort_index = Series(by, index=labels).order().index
+ elif isinstance(by, DataFrame):
+ assert(len(by.index) == len(labels))
+ type_list = [(col_name, by[col_name].dtype) for col_name in by.columns]
+ sort_arr = np.array([tuple(r) for r in by.values], dtype=type_list)
+ sort_index = labels.take(sort_arr.argsort(order=by.columns.tolist()))
else:
sort_index = labels.take(labels.argsort())
@@ -2486,18 +2485,41 @@ def corr(self):
correl = baseCov / np.outer(sigma, sigma)
# Get the covariance with items that have NaN values
+ for i, j, ac, bc in self._cov_helper(mat):
+ c = np.corrcoef(ac, bc)[0, 1]
+ correl[i, j] = c
+ correl[j, i] = c
+
+ return self._constructor(correl, index=cols, columns=cols)
+
+ def cov(self):
+ """
+ Compute pairwise covariance of columns, excluding NA/null values
+
+ Returns
+ -------
+ y : DataFrame
+ """
+ cols = self.columns
+ mat = self.as_matrix(cols).T
+ baseCov = np.cov(mat)
+
+ for i, j, ac, bc in self._cov_helper(mat):
+ c = np.cov(ac, bc)[0, 1]
+ baseCov[i, j] = c
+ baseCov[j, i] = c
+
+ return self._constructor(baseCov, index=cols, columns=cols)
+
+ def _cov_helper(self, mat):
+ # Get the covariance with items that have NaN values
mask = np.isfinite(mat)
for i, A in enumerate(mat):
if not mask[i].all():
for j, B in enumerate(mat):
in_common = mask[i] & mask[j]
if in_common.any():
- ac, bc = A[in_common], B[in_common]
- c = np.corrcoef(ac, bc)[0, 1]
- correl[i, j] = c
- correl[j, i] = c
-
- return self._constructor(correl, index=cols, columns=cols)
+ yield i, j, A[in_common], B[in_common]
def corrwith(self, other, axis=0, drop=False):
"""
View
@@ -903,15 +903,40 @@ def corr(self, other):
-------
correlation : float
"""
+ this, that = self._get_nonna_aligned(other)
+ if this is None or that is None:
+ return nan
+ return np.corrcoef(this, that)[0, 1]
+
+ def cov(self, other):
+ """
+ Compute covariance with Series, excluding missing values
+
+ Parameters
+ ----------
+ other : Series
+
+ Returns
+ -------
+ covariance : float
+ """
+ this, that = self._get_nonna_aligned(other)
+ if this is None or that is None:
+ return nan
+ return np.cov(this, that)[0, 1]
+
+ def _get_nonna_aligned(self, other):
+ """
+ Returns two sub-Series with the same index and only non-na values
+ """
commonIdx = self.dropna().index.intersection(other.dropna().index)
@wesm

wesm Nov 13, 2011

Owner

You can also do:

    this, that = self.dropna().align(other.dropna(), join='inner')

if len(commonIdx) == 0:
- return nan
+ return None, None
this = self.reindex(commonIdx)
that = other.reindex(commonIdx)
-
- return np.corrcoef(this, that)[0, 1]
+ return this, that
def diff(self, periods=1):
"""
View
@@ -1950,6 +1950,14 @@ def test_corr(self):
assert_almost_equal(correls['A']['C'],
self.frame['A'].corr(self.frame['C']))
+
+ def test_cov(self):
+ self.frame['A'][:5] = nan
+ self.frame['B'][:10] = nan
+ cov = self.frame.cov()
+
+ assert_almost_equal(cov['A']['C'],
+ self.frame['A'].cov(self.frame['C']))
def test_corrwith(self):
a = self.tsframe
@@ -2698,6 +2706,28 @@ def test_sort_index(self):
expected = frame.ix[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
+ # by multiple columns
+ frame.values[1, 0] = frame.values[0, 0]
+ smaller, larger = min(frame.values[:1, 1]), max(frame.values[:1, 1])
+ if smaller == larger:
+ larger = smaller + 1
+ frame.values[0, 1] = larger
+ frame.values[1, 1] = smaller
+
+ sorted_df = frame.sort_index(by=['A', 'B'])
+ indexer = frame['A'].argsort().values
+ zero_mask = indexer == 0
+ one_mask = indexer == 1
+ indexer[zero_mask] = 1
+ indexer[one_mask] = 0
+ expected = frame.ix[frame.index[indexer]]
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_index(by=['A', 'B'], ascending=False)
+ indexer = indexer[::-1]
+ expected = frame.ix[frame.index[indexer]]
+ assert_frame_equal(sorted_df, expected)
+
# check for now
sorted_df = frame.sort(column='A')
expected = frame.sort_index(by='A')
@@ -833,6 +833,16 @@ def test_corr(self):
# additional checks?
+ def test_cov(self):
+ # full overlap
+ self.assertAlmostEqual(self.ts.cov(self.ts), self.ts.std()**2)
+
+ # partial overlap
+ self.assertAlmostEqual(self.ts[:15].cov(self.ts[5:]), self.ts[5:15].std()**2)
+
+ # No overlap
+ self.assert_(np.isnan(self.ts[::2].cov(self.ts[1::2])))
+
def test_copy(self):
ts = self.ts.copy()