
Commit

Merge 322fe6f into 794cddc
sinhrks committed Jan 11, 2019
2 parents 794cddc + 322fe6f commit 840891d
Showing 23 changed files with 141 additions and 145 deletions.
12 changes: 6 additions & 6 deletions .travis.yml
@@ -1,11 +1,11 @@
language: python

env:
- PYTHON=2.7 PANDAS=0.20.2 SKLEARN=0.18.2
- PYTHON=2.7 PANDAS=0.21.0 SKLEARN=0.19.1 IMBALANCE=true
- PYTHON=3.5 PANDAS=0.19.1 SKLEARN=0.18.2
- PYTHON=3.6 PANDAS=0.20.2 SKLEARN=0.18.2 IMBALANCE=true
- PYTHON=3.6 PANDAS=0.21.0 SKLEARN=0.19.1 IMBALANCE=true COVERAGE=true
- PYTHON=2.7 PANDAS=0.19.2 SKLEARN=0.19.1
- PYTHON=2.7 PANDAS=0.20.2 SKLEARN=0.20.2 IMBALANCE=true
- PYTHON=3.5 PANDAS=0.21.1 SKLEARN=0.19.1
- PYTHON=3.6 PANDAS=0.22.0 SKLEARN=0.19.1 IMBALANCE=true
- PYTHON=3.7 PANDAS=0.23.2 SKLEARN=0.20.2 IMBALANCE=true COVERAGE=true

addons:
apt:
@@ -24,7 +24,7 @@ script:
- export PATH="$HOME/miniconda/bin:$PATH"
- source activate myenv
- python -m nose --with-coverage --cover-package=pandas_ml
- flake8 --ignore E501 pandas_ml
- flake8 --ignore E501,W503 pandas_ml

after_success:
- if [ "$COVERAGE" ]; then
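Note: W503 flags a line break before a binary operator; with it ignored, the continued
expressions touched in this commit move the operator to the start of the continuation
line. A minimal illustration (names are placeholders, not code from this repository):

    # with W503 ignored, wrapped expressions put the binary operator first
    numerator = (true_positive * true_negative
                 - false_positive * false_negative)
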
14 changes: 14 additions & 0 deletions doc/source/whatsnew.rst
@@ -2,6 +2,20 @@
What's new
==========

v0.7.0
------

Enhancement
^^^^^^^^^^^

- Support pandas v0.22.0 and scikit-learn 0.20.0.

API Change
^^^^^^^^^^

- `ModelFrame.model_selection.describe` now returns a `ModelFrame` compatible
  with `GridSearchCV.cv_results_`.

v0.6.0
------

11 changes: 10 additions & 1 deletion pandas_ml/compat.py
@@ -7,9 +7,18 @@

PANDAS_VERSION = LooseVersion(pd.__version__)

if PANDAS_VERSION >= LooseVersion('0.23'):
_PANDAS_ge_023 = True
else:
_PANDAS_ge_023 = False

if PANDAS_VERSION >= LooseVersion('0.22'):
_PANDAS_ge_022 = True
else:
_PANDAS_ge_022 = False

if PANDAS_VERSION >= LooseVersion('0.21'):
_PANDAS_ge_021 = True

else:
_PANDAS_ge_021 = False

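For context, a sketch of how version flags like `_PANDAS_ge_022` are typically consumed
elsewhere in the package; the branch bodies below are placeholders, not code from this
commit:

    from pandas_ml.compat import _PANDAS_ge_022

    if _PANDAS_ge_022:
        pass  # use behaviour available from pandas 0.22 onwards
    else:
        pass  # fall back to the older code path
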
8 changes: 3 additions & 5 deletions pandas_ml/confusion_matrix/bcm.py
@@ -296,12 +296,10 @@ def F1_score(self):
def MCC(self):
"""
Matthews correlation coefficient (MCC)
\frac{TP \times TN - FP \times FN}
{\sqrt{(TP + FP)(TP + FN)(TN + FP)(TN + FN)}}
"""
return((self.TP * self.TN - self.FP * self.FN) /
math.sqrt((self.TP + self.FP) * (self.TP + self.FN) *
(self.TN + self.FP) * (self.TN + self.FN)))
return((self.TP * self.TN - self.FP * self.FN)
/ math.sqrt((self.TP + self.FP) * (self.TP + self.FN)
* (self.TN + self.FP) * (self.TN + self.FN)))

@property
def informedness(self):
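A quick numeric check of the MCC expression above, with made-up counts:

    import math

    TP, TN, FP, FN = 20, 15, 5, 10
    mcc = ((TP * TN - FP * FN)
           / math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))
    # (20 * 15 - 5 * 10) / sqrt(25 * 30 * 20 * 25) = 250 / sqrt(375000) ≈ 0.408
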
20 changes: 16 additions & 4 deletions pandas_ml/core/frame.py
@@ -413,15 +413,24 @@ def _wrap_predicted(self, predicted, estimator):
dict(funcname='fit_sample', returned='returned : sampling result'))
def fit_sample(self, estimator, *args, **kwargs):
# for imblearn
sampled_X, sampled_y = self._call(estimator, 'fit_sample', *args, **kwargs)
msg = ".fit_sample is deprecated. Use .fit_resample instead"
warnings.warn(msg, DeprecationWarning)
return self.fit_resample(estimator, *args, **kwargs)

@Appender(_shared_docs['estimator_methods'] %
dict(funcname='fit_resample', returned='returned : resampling result'))
def fit_resample(self, estimator, *args, **kwargs):
# for imblearn
sampled_X, sampled_y = self._call(estimator, 'fit_resample', *args, **kwargs)
return self._wrap_sampled(sampled_X, sampled_y)

@Appender(_shared_docs['estimator_methods'] %
dict(funcname='sample', returned='returned : sampling result'))
def sample(self, estimator, *args, **kwargs):
# for imblearn
sampled_X, sampled_y = self._call(estimator, 'sample', *args, **kwargs)
return self._wrap_sampled(sampled_X, sampled_y)
msg = ".sample is deprecated. Use .fit_resample instead"
warnings.warn(msg, DeprecationWarning)
return self.fit_resample(estimator, *args, **kwargs)

def _wrap_sampled(self, sampled_X, sampled_y):
# revert sampled results to ModelFrame, index is being reset
@@ -454,7 +463,10 @@ def transform(self, estimator, *args, **kwargs):
# set inverse columns
estimator._pdml_original_columns = self.data.columns
return transformed
except: # noqa
except ImportError:
# raise patsy error
raise
except Exception as e: # noqa
return pd.DataFrame.transform(self, estimator, *args, **kwargs)

@Appender(_shared_docs['estimator_methods'] %
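A usage sketch of the deprecation above, assuming an imbalanced ModelFrame named `df`
and the `df.imbalance.over_sampling` accessor (names illustrative):

    sampler = df.imbalance.over_sampling.RandomOverSampler()

    resampled = df.fit_resample(sampler)   # preferred spelling
    resampled = df.fit_sample(sampler)     # still works, now emits DeprecationWarning
    resampled = df.sample(sampler)         # likewise deprecated, delegates to fit_resample
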
6 changes: 4 additions & 2 deletions pandas_ml/core/series.py
@@ -31,8 +31,7 @@ def _wrap_transform(self, transformed, columns=None):
Wrapper for transform methods
"""
if len(transformed.shape) == 2:
if (util._is_1d_harray(transformed) or
util._is_1d_varray(transformed)):
if (util._is_1d_harray(transformed) or util._is_1d_varray(transformed)):
transformed = transformed.flatten()
else:
from pandas_ml.core.frame import ModelFrame
@@ -46,6 +45,9 @@ def transform(self, estimator, *args, **kwargs):
try:
transformed = super(ModelSeries, self).transform(estimator, *args, **kwargs)
return transformed
except ImportError:
# raise patsy error
raise
except: # noqa
return pd.Series.transform(self, estimator, *args, **kwargs)

8 changes: 4 additions & 4 deletions pandas_ml/imbaccessors/test/test_imbalance.py
@@ -83,8 +83,8 @@ def test_sample(self):
df.fit(mod1)
mod2.fit(X, y)

result = df.sample(mod1)
expected_X, expected_y = mod2.sample(X, y)
result = df.fit_resample(mod1)
expected_X, expected_y = mod2.fit_resample(X, y)

self.assertIsInstance(result, pdml.ModelFrame)
tm.assert_numpy_array_equal(result.target.values, expected_y)
@@ -119,8 +119,8 @@ def test_sample_ensemble(self):
df.fit(mod1)
mod2.fit(X, y)

results = df.sample(mod1)
expected_X, expected_y = mod2.sample(X, y)
results = df.fit_resample(mod1)
expected_X, expected_y = mod2.fit_resample(X, y)

self.assertIsInstance(results, list)
for r in results:
3 changes: 3 additions & 0 deletions pandas_ml/misc/patsy_wraps.py
@@ -3,9 +3,12 @@

def transform_with_patsy(formula, data, *args, **kwargs):
try:
# needs patsy v0.5.1 to support formula in Python 3.7
# https://github.com/pydata/patsy/pull/131
import patsy
except ImportError:
raise ImportError("'patsy' is required to transform with string formula")

if '~' in formula:
y, X = patsy.dmatrices(formula, data=data, return_type='dataframe',
*args, **kwargs)
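For reference, a minimal standalone sketch of the patsy call this wrapper builds on
(data values are illustrative):

    import pandas as pd
    import patsy  # >= 0.5.1 for Python 3.7 formula support

    data = pd.DataFrame({'y': [1., 2., 3., 4.], 'x': [2., 4., 6., 8.]})
    y, X = patsy.dmatrices('y ~ x', data=data, return_type='dataframe')
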
4 changes: 2 additions & 2 deletions pandas_ml/skaccessors/metrics.py
@@ -251,8 +251,8 @@ def pairwise(self):
_regression_methods = ['explained_variance_score', 'mean_absolute_error',
'mean_squared_error', 'r2_score']
_cluster_methods = ['mutual_info_score']
_true_pred_methods = (_classification_methods + _regression_methods +
_cluster_methods)
_true_pred_methods = (_classification_methods + _regression_methods
+ _cluster_methods)
_attach_methods(MetricsMethods, _wrap_target_pred_func, _true_pred_methods)


11 changes: 1 addition & 10 deletions pandas_ml/skaccessors/model_selection.py
@@ -128,16 +128,7 @@ def describe(self, estimator):
-------
described : ``ModelFrame``
"""
results = []
for params, mean_score, scores in estimator.grid_scores_:
row = dict(mean=mean_score, std=scores.std())
row.update(params)
results.append(row)
df = self._constructor(results)

scores = pd.Index(['mean', 'std'])
df = df[scores.append(df.columns[~df.columns.isin(scores)])]
return df
return self._constructor(estimator.cv_results_)

# Model validation

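A usage sketch of the simplified `describe`, assuming a ModelFrame `df` and a fitted
`GridSearchCV` named `cv`:

    described = df.model_selection.describe(cv)   # ModelFrame built from cv.cv_results_
    # the standard cv_results_ columns are then available directly, e.g.
    described[['mean_test_score', 'std_test_score']].head()
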
4 changes: 0 additions & 4 deletions pandas_ml/skaccessors/test/test_decomposition.py
@@ -14,10 +14,6 @@ def test_objectmapper(self):
self.assertIs(df.decomposition.PCA, decomposition.PCA)
self.assertIs(df.decomposition.IncrementalPCA,
decomposition.IncrementalPCA)

self.assertIs(df.decomposition.RandomizedPCA,
decomposition.RandomizedPCA)

self.assertIs(df.decomposition.KernelPCA, decomposition.KernelPCA)
self.assertIs(df.decomposition.FactorAnalysis,
decomposition.FactorAnalysis)
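`RandomizedPCA` was removed in scikit-learn 0.20; for reference, the equivalent spelling
on current scikit-learn:

    from sklearn.decomposition import PCA

    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
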
83 changes: 48 additions & 35 deletions pandas_ml/skaccessors/test/test_gaussian_process.py
@@ -19,7 +19,6 @@ def test_objectmapper(self):
gp.GaussianProcessClassifier)
self.assertIs(dgp.GaussianProcessRegressor,
gp.GaussianProcessRegressor)
self.assertIs(dgp.GaussianProcess, gp.GaussianProcess)
self.assertIs(dgp.correlation_models.absolute_exponential,
gp.correlation_models.absolute_exponential)
self.assertIs(dgp.correlation_models.squared_exponential,
@@ -85,35 +84,27 @@ def test_quadratic(self):
expected = gp.regression_models.quadratic(X)
self.assert_numpy_array_almost_equal(result, expected)

def test_GaussianProcess_lt_017(self):
# http://scikit-learn.org/stable/modules/gaussian_process.html
def test_GaussianProcess_ge_018(self):
X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
y = np.sin(X).ravel()
df = pdml.ModelFrame(X, target=y)

g1 = df.gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4,
thetaU=1e-1)
g2 = gp.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
k1 = (df.gp.kernels.ConstantKernel(1.0, (1e-3, 1e3))
* df.gp.kernels.RBF(10, (1e-2, 1e2)))
g1 = df.gp.GaussianProcessRegressor(kernel=k1, n_restarts_optimizer=9,
random_state=self.random_state)

k2 = (gp.kernels.ConstantKernel(1.0, (1e-3, 1e3))
* gp.kernels.RBF(10, (1e-2, 1e2)))
g2 = gp.GaussianProcessRegressor(kernel=k2, n_restarts_optimizer=9,
random_state=self.random_state)

g1.fit(X, y)
g2.fit(X, y)

x = np.atleast_2d(np.linspace(0, 10, 1000)).T
tdf = pdml.ModelFrame(x)

y_result, sigma2_result = tdf.predict(g1, eval_MSE=True)
y_expected, sigma2_expected = g2.predict(x, eval_MSE=True)

self.assertIsInstance(y_result, pdml.ModelSeries)
tm.assert_index_equal(y_result.index, tdf.index)

self.assertIsInstance(sigma2_result, pdml.ModelSeries)
tm.assert_index_equal(sigma2_result.index, tdf.index)

self.assert_numpy_array_almost_equal(y_result.values, y_expected)
self.assert_numpy_array_almost_equal(sigma2_result.values,
sigma2_expected)

y_result = tdf.predict(g1)
y_expected = g2.predict(x)

@@ -122,18 +113,18 @@ def test_GaussianProcess_lt_017(self):

self.assert_numpy_array_almost_equal(y_result, y_expected)

def test_GaussianProcess_ge_018(self):
def test_GaussianProcess_std(self):
X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
y = np.sin(X).ravel()
df = pdml.ModelFrame(X, target=y)

k1 = (df.gp.kernels.ConstantKernel(1.0, (1e-3, 1e3)) *
df.gp.kernels.RBF(10, (1e-2, 1e2)))
k1 = (df.gp.kernels.ConstantKernel(1.0, (1e-3, 1e3))
* df.gp.kernels.RBF(10, (1e-2, 1e2)))
g1 = df.gp.GaussianProcessRegressor(kernel=k1, n_restarts_optimizer=9,
random_state=self.random_state)

k2 = (gp.kernels.ConstantKernel(1.0, (1e-3, 1e3)) *
gp.kernels.RBF(10, (1e-2, 1e2)))
k2 = (gp.kernels.ConstantKernel(1.0, (1e-3, 1e3))
* gp.kernels.RBF(10, (1e-2, 1e2)))
g2 = gp.GaussianProcessRegressor(kernel=k2, n_restarts_optimizer=9,
random_state=self.random_state)

@@ -156,15 +147,37 @@ def test_GaussianProcess_ge_018(self):
self.assert_numpy_array_almost_equal(std_result.values,
std_expected)

y_result = tdf.predict(g1)
y_expected = g2.predict(x)
def test_Gaussian2D(self):
# http://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.html

self.assertIsInstance(y_result, pdml.ModelSeries)
tm.assert_index_equal(y_result.index, tdf.index)
def g(x):
"""The function to predict (classification will then consist in predicting
whether g(x) <= 0 or not)"""
return 5. - x[:, 1] - .5 * x[:, 0] ** 2.

self.assert_numpy_array_almost_equal(y_result, y_expected)
# Design of experiments
X = np.array([[-4.61611719, -6.00099547],
[4.10469096, 5.32782448],
[0.00000000, -0.50000000],
[-6.17289014, -4.6984743],
[1.3109306, -6.93271427],
[-5.03823144, 3.10584743],
[-2.87600388, 6.74310541],
[5.21301203, 4.26386883]])
y = g(X)

def test_Gaussian2D(self):
df = pdml.ModelFrame(X, target=y)
gpm1 = df.gaussian_process.GaussianProcessRegressor()
df.fit(gpm1)
result = df.predict(gpm1)

gpm2 = gp.GaussianProcessRegressor()
gpm2.fit(X, y)
expected = gpm2.predict(X)

self.assert_numpy_array_almost_equal(result.values, expected)

def test_Gaussian2D_std(self):
# http://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.html

def g(x):
@@ -184,16 +197,16 @@ def g(x):
y = g(X)

df = pdml.ModelFrame(X, target=y)
gpm1 = df.gaussian_process.GaussianProcess(theta0=5e-1)
gpm1 = df.gaussian_process.GaussianProcessRegressor()
df.fit(gpm1)
result, result_MSE = df.predict(gpm1, eval_MSE=True)
result, std_result = df.predict(gpm1, return_std=True)

gpm2 = gp.GaussianProcess(theta0=5e-1)
gpm2 = gp.GaussianProcessRegressor()
gpm2.fit(X, y)
expected, expected_MSE = gpm2.predict(X, eval_MSE=True)
expected, std_expected = gpm2.predict(X, return_std=True)

self.assert_numpy_array_almost_equal(result.values, expected)
self.assert_numpy_array_almost_equal(result_MSE.values, expected_MSE)
self.assert_numpy_array_almost_equal(std_result.values, std_expected)


if __name__ == '__main__':
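The removed `GaussianProcess(theta0=...)` estimator no longer exists in scikit-learn 0.20;
a condensed, self-contained sketch of the replacement API exercised by the tests above:

    import numpy as np
    from sklearn import gaussian_process as gp

    X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
    y = np.sin(X).ravel()
    x_new = np.atleast_2d(np.linspace(0, 10, 50)).T

    kernel = (gp.kernels.ConstantKernel(1.0, (1e-3, 1e3))
              * gp.kernels.RBF(10, (1e-2, 1e2)))
    model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
    model.fit(X, y)
    y_pred, y_std = model.predict(x_new, return_std=True)  # replaces the old eval_MSE=True
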
3 changes: 1 addition & 2 deletions pandas_ml/skaccessors/test/test_mixture.py
@@ -14,13 +14,12 @@ def test_objectmapper(self):
self.assertIs(df.mixture.GaussianMixture, mixture.GaussianMixture)
self.assertIs(df.mixture.BayesianGaussianMixture,
mixture.BayesianGaussianMixture)
self.assertIs(df.mixture.DPGMM, mixture.DPGMM)

def test_Classifications(self):
iris = datasets.load_iris()
df = pdml.ModelFrame(iris)

models = ['GMM', 'DPGMM', 'VBGMM']
models = ['GaussianMixture', 'BayesianGaussianMixture']
for model in models:
mod1 = getattr(df.mixture, model)(random_state=self.random_state)
mod2 = getattr(mixture, model)(random_state=self.random_state)
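`GMM`, `DPGMM` and `VBGMM` no longer exist in scikit-learn 0.20; a minimal sketch of the
replacements tested above (parameters illustrative):

    from sklearn import datasets, mixture

    X = datasets.load_iris().data
    gm = mixture.GaussianMixture(n_components=3, random_state=0).fit(X)            # was GMM
    bgm = mixture.BayesianGaussianMixture(n_components=3, random_state=0).fit(X)   # was DPGMM / VBGMM
    labels = gm.predict(X)
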
10 changes: 1 addition & 9 deletions pandas_ml/skaccessors/test/test_model_selection.py
@@ -338,15 +338,7 @@ def test_grid_search(self):
df.fit(cv)

result = df.model_selection.describe(cv)
expected = pd.DataFrame({'mean': [0.97161937, 0.9476906, 0.97273233, 0.95937674, 0.97273233,
0.96271564, 0.94936004, 0.94936004, 0.94936004],
'std': [0.01546977, 0.0221161, 0.01406514, 0.02295168, 0.01406514,
0.01779749, 0.01911084, 0.01911084, 0.01911084],
'C': [1, 1, 10, 10, 100, 100, 1, 10, 100],
'gamma': [0.001, 0.0001, 0.001, 0.0001, 0.001, 0.0001,
np.nan, np.nan, np.nan],
'kernel': ['rbf'] * 6 + ['linear'] * 3},
columns=['mean', 'std', 'C', 'gamma', 'kernel'])
expected = pd.DataFrame(cv.cv_results_)
self.assertIsInstance(result, pdml.ModelFrame)
tm.assert_frame_equal(result, expected)

