In [12]:
%load_ext autoreload
%autoreload 2
import os, sys
algo_root = '..'
sys.path.insert(0, algo_root)
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from tools.gm_tools import gm_params_generator, gaussian_mixture_sample, covar_estim, score, tau_estim
from tools.algorithms_benchmark import view2Ddata
from tools.gm_tools import score
from cluster.sq_root_lasso import sqrt_lasso_gmm
from sklearn.mixture import GMM

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [32]:
pi, means, covars = gm_params_generator(2,4)
X,_ = gaussian_mixture_sample(pi, means, covars, 1e4)
#view2Ddata(X)

In [33]:
X_train, X_validation, y_train, y_test = train_test_split(
    X, np.zeros(len(X)), test_size=0.2, random_state=0)

In [18]:
#grid search on sq_root_lasso method
max_clusters = 7
lambd = np.sqrt(2*np.log(max_clusters)/X_train.shape[0])
param = {"lambd":[lambd, lambd+1e-1, lambd+1, lambd+10, lambd+100], "lipz_c":[1, 10, 100, 1000, 10000], "max_clusters":[max_clusters]}
clf = GridSearchCV(estimator=sqrt_lasso_gmm(), param_grid=param, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=sqrt_lasso_gmm(lambd=1, lipz_c=1, max_clusters=8, n_iter=100, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'lipz_c': [1, 10, 100, 1000, 10000], 'max_clusters': [7], 'lambd': [0.022056235790901137, 0.12205623579090114, 1.0220562357909011, 10.022056235790901, 100.0220562357909]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)




In [36]:
#we define a bic scoring method for the grid search
def bic_scorer(estimator, X, y=None):
    return (-2*score(X, estimator.weights_, estimator.means_, estimator.covars_ ) +
            estimator._n_parameters()*np.log(X.shape[0]))

params_GMM={"n_components":range(2,8)}
clf_gmm = GridSearchCV(GMM(), param_grid=params_GMM, cv=5, n_jobs=-1, scoring=bic_scorer)
clf_gmm.fit(X_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
  n_components=1, n_init=1, n_iter=100, params='wmc', random_state=None,
  thresh=None, tol=0.001, verbose=0),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_components': [2, 3, 4, 5, 6, 7]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function bic_scorer at 0x7f57b70547d0>, verbose=0)

In [37]:
#we evaluate the loglikelihood of the fitted models on X_validation
print "real loglikelihood: ", 1./X_validation.shape[0]*score(X_validation, pi, means, covars)
print "real pi:", pi
print "real means:", means
print "###sq_root lasso method###"
print "sq_root lasso method loglikelihood:", 1./X_validation.shape[0]*score(X_validation, clf.best_estimator_.pi_, clf.best_estimator_.means_, clf.best_estimator_.covars_)
print "pi: ", clf.best_estimator_.pi_
print "means: ", clf.best_estimator_.means_
print "grid search best params:", clf.best_params_
print "###EM + BIC###"
print "EM loglikelihood:", 1./X_validation.shape[0]*score(X_validation, clf_gmm.best_estimator_.weights_, clf_gmm.best_estimator_.means_, clf_gmm.best_estimator_.covars_)
print "pi: ", clf_gmm.best_estimator_.weights_
print "means: ", clf_gmm.best_estimator_.means_
print "grid search best params:", clf_gmm.best_params_


real loglikelihood:  2.81269509428
real pi: [ 0.26936416  0.1716763   0.42601156  0.13294798]
real means: [array([-0.43226495, -0.41338618]), array([ 0.0882207 ,  0.23866644]), array([-0.45780743, -0.08397424]), array([ 0.10358786, -0.17054718])]
###sq_root lasso method###
sq_root lasso method loglikelihood: -30.8398345336
pi:  [0.015591783151773588, 0.017061859323466749, 0.015881762944523935, 0.00012298709269542143, 0.61916922844898858, 0.012955419532428191, 0.31921695950612361]
means:  [array([ 0.1587062 ,  0.31733449]), array([ 0.18933663, -0.26139514]), array([ 0.15581441,  0.32244557]), array([  4.70824970e-05,  -3.30100391e-06]), array([ 0.16449963,  0.32428367]), array([ 0.1573244 ,  0.32307177]), array([ 0.18546084, -0.25353851])]
grid search best params: {'lipz_c': 1, 'max_clusters': 7, 'lambd': 0.022056235790901137}
###EM + BIC###
EM loglikelihood: 1.69086202784
pi:  [ 0.6985  0.3015]
means:  [[-0.44789189 -0.21130053]
 [ 0.09486129  0.06670401]]
grid search best params: {'n_

In [30]:
clf.grid_scores_

[mean: 4060.13955, std: 20.23495, params: {'lipz_c': 1, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: 4059.59639, std: 20.71163, params: {'lipz_c': 10, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: 4055.78764, std: 19.74048, params: {'lipz_c': 100, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: -10455.07918, std: 7345.82141, params: {'lipz_c': 1000, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: -4819.13629, std: 6967.13678, params: {'lipz_c': 10000, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: 3988.09737, std: 71.72656, params: {'lipz_c': 1, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: 3960.56118, std: 22.31289, params: {'lipz_c': 10, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: 3976.10568, std: 18.89363, params: {'lipz_c': 100, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: -2719.65654, std: 6972.95227, params: {'lipz_c': 1000, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: -9848.59

In [31]:
clf_gmm.grid_scores_

[mean: -8033.73839, std: 49.79955, params: {'covariance_type': 'full', 'n_components': 2},
 mean: -7992.80717, std: 49.86288, params: {'covariance_type': 'full', 'n_components': 3},
 mean: -7947.87549, std: 49.97700, params: {'covariance_type': 'full', 'n_components': 4},
 mean: -7903.39706, std: 49.88329, params: {'covariance_type': 'full', 'n_components': 5},
 mean: -7859.14741, std: 49.90600, params: {'covariance_type': 'full', 'n_components': 6},
 mean: -7815.15852, std: 49.99664, params: {'covariance_type': 'full', 'n_components': 7}]