In [12]:
%load_ext autoreload
%autoreload 2
import os, sys
# Prepend the parent directory to the module search path before the
# project-local imports that follow in this cell.
# NOTE(review): presumably '..' is where the `tools` and `cluster` packages
# live -- confirm against the repository layout.
algo_root = '..'
sys.path.insert(0, algo_root)
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from tools.gm_tools import gm_params_generator, gaussian_mixture_sample, covar_estim, score, tau_estim
from tools.algorithms_benchmark import view2Ddata
from tools.gm_tools import score
from cluster.sq_root_lasso import sqrt_lasso_gmm
from sklearn.mixture import GMM

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [55]:
# Generate a Gaussian-mixture parameter set and draw a dataset from it.
#
# Seed numpy's global RNG so that Restart & Run All reproduces the same data
# (the train/validation split in a later cell is already seeded).
# NOTE(review): this only helps if tools.gm_tools draws from numpy's global
# RNG -- confirm in gm_params_generator / gaussian_mixture_sample.
np.random.seed(0)
# NOTE(review): the meaning of the positional arguments (2, 5) is defined in
# tools.gm_tools -- presumably (dimension, number of components); confirm.
pi, means, covars = gm_params_generator(2,5)
# Fixed alternative to the random parameters above; uncomment both lines to
# use a hand-picked 3-component, 2-D mixture instead.
#pi = np.array([0.2 , 0.4, 0.4])
#means = np.array( [[0,0],[2,2],[2,0]])
# A sample count is an integer quantity; the literal 1e5 is a float, which
# numpy size/shape arguments do not accept, so convert it explicitly.
n_samples = int(1e5)
# The second return value is discarded.
# NOTE(review): presumably the per-sample component labels -- confirm.
X,_ = gaussian_mixture_sample(pi, means, covars, n_samples)
#view2Ddata(X)

In [57]:
# Hold out 20% of the samples for validation, with a fixed random_state so the
# split is reproducible. The label vector is an all-zero placeholder: it only
# exists so that a `y` can be passed alongside X (see the GridSearchCV.fit call
# on the sqrt-lasso search).
X_train, X_validation, y_train, y_validation = train_test_split(
    X, np.zeros(len(X)), test_size=0.2, random_state=0)
# Backward-compatible alias: the held-out labels used to be bound to `y_test`,
# which did not match the `X_validation` naming of the held-out features.
y_test = y_validation

In [51]:
# Cross-validated grid search over the sqrt-lasso GMM hyper-parameters.
max_clusters = 8
n_train = X_train.shape[0]
# Base regularisation level: sqrt(2 * log(max_clusters) / n_train).
lambd = np.sqrt(2*np.log(max_clusters)/n_train)
param = {
    "lambd": [lambd, lambd+10, lambd+1e2, lambd+1e3],
    "lipz_c": [1, 1e1, 1e2, 1e3],
    "max_clusters": [max_clusters],
}
# No `scoring` is given, so GridSearchCV ranks candidates with the
# estimator's own `score` method.
clf = GridSearchCV(
    estimator=sqrt_lasso_gmm(n_iter=50),
    param_grid=param,
    cv=5,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=sqrt_lasso_gmm(lambd=1, lipz_c=1, max_clusters=8, n_iter=50, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'lipz_c': [1, 100], 'max_clusters': [5], 'lambd': [0.020058900221809894, 1.0200589002218099]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [52]:
# BIC-based scoring callable for the grid search.
def bic_scorer(estimator, X, y=None):
    """Return a BIC-style score where larger values are better.

    Computes ``2 * score(X, weights, means, covars) - k * log(n)``, with
    ``k = estimator._n_parameters()`` and ``n = X.shape[0]``. This is the
    negative of the usual BIC, which suits a "greater is better" model
    selection loop.

    Parameters
    ----------
    estimator : fitted mixture model exposing ``weights_``, ``means_``,
        ``covars_`` and ``_n_parameters()``.
    X : array with one sample per row (only ``X.shape[0]`` is read here).
    y : ignored; accepted so the function matches the
        ``scorer(estimator, X, y)`` call signature.

    NOTE(review): ``score`` comes from tools.gm_tools and is presumably the
    total (not per-sample) log-likelihood of X -- confirm.
    """
    log_likelihood = score(X, estimator.weights_, estimator.means_, estimator.covars_)
    complexity_penalty = estimator._n_parameters()*np.log(X.shape[0])
    return 2*log_likelihood - complexity_penalty

# Select the number of EM-fitted GMM components by cross-validated BIC score.
# Candidate counts run from 2 up to and including max_clusters.
params_GMM = {
    "n_components": range(2, max_clusters+1),
}
clf_gmm = GridSearchCV(
    GMM(),
    param_grid=params_GMM,
    cv=5,
    n_jobs=-1,
    scoring=bic_scorer,
)
clf_gmm.fit(X_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
  n_components=1, n_init=1, n_iter=100, params='wmc', random_state=None,
  thresh=None, tol=0.001, verbose=0),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_components': [2, 3, 4, 5, 6, 7]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function bic_scorer at 0x7f57b29629b0>, verbose=0)

In [53]:
#we evaluate the loglikelihood of the fitted models on X_validation
print "real loglikelihood: ", 1./X_validation.shape[0]*score(X_validation, pi, means, covars)
print "real pi:", pi
print "real means:", means
print "###sq_root lasso method###"
print "sq_root lasso method loglikelihood:", 1./X_validation.shape[0]*score(X_validation, clf.best_estimator_.pi_, clf.best_estimator_.means_, clf.best_estimator_.covars_)
print "pi: ", clf.best_estimator_.pi_
print "means: ", clf.best_estimator_.means_
print "grid search best params:", clf.best_params_
print "###EM + BIC###"
print "EM loglikelihood:", 1./X_validation.shape[0]*score(X_validation, clf_gmm.best_estimator_.weights_, clf_gmm.best_estimator_.means_, clf_gmm.best_estimator_.covars_)
print "pi: ", clf_gmm.best_estimator_.weights_
print "means: ", clf_gmm.best_estimator_.means_
print "grid search best params:", clf_gmm.best_params_


real loglikelihood:  2.68053877041
real pi: [ 0.2  0.4  0.4]
real means: [[0 0]
 [2 2]
 [2 0]]
###sq_root lasso method###
sq_root lasso method loglikelihood: 2.54964435881
pi:  [0.39244106072983204, 0.40613092616687047, 0.20142801310329761]
means:  [array([ 2.00674191,  2.0066682 ]), array([  2.00695056e+00,   1.31126697e-04]), array([ 0.00144216, -0.00092002])]
grid search best params: {'lipz_c': 1, 'max_clusters': 5, 'lambd': 0.020058900221809894}
###EM + BIC###
EM loglikelihood: 2.54874389886
pi:  [ 0.198875  0.407375  0.39375 ]
means:  [[  1.46066879e-03  -9.31831392e-04]
 [  2.00082158e+00   1.30726252e-04]
 [  2.00007092e+00   1.99999745e+00]]
grid search best params: {'n_components': 3}


In [49]:
# Show the cross-validation mean/std score for every parameter combination
# tried by the sqrt-lasso grid search.
clf.grid_scores_

[mean: 4060.13955, std: 20.23495, params: {'lipz_c': 1, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: 4059.59639, std: 20.71163, params: {'lipz_c': 10, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: 4055.78764, std: 19.74048, params: {'lipz_c': 100, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: -10455.07918, std: 7345.82141, params: {'lipz_c': 1000, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: -4819.13629, std: 6967.13678, params: {'lipz_c': 10000, 'max_clusters': 7, 'lambd': 0.022056235790901137},
 mean: 3988.09737, std: 71.72656, params: {'lipz_c': 1, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: 3960.56118, std: 22.31289, params: {'lipz_c': 10, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: 3976.10568, std: 18.89363, params: {'lipz_c': 100, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: -2719.65654, std: 6972.95227, params: {'lipz_c': 1000, 'max_clusters': 7, 'lambd': 0.12205623579090114},
 mean: -9848.59

In [50]:
# Show the cross-validation mean/std of the BIC-based score for every
# n_components value tried by the GMM grid search.
clf_gmm.grid_scores_

[mean: 3411.95855, std: 141.65137, params: {'n_components': 2},
 mean: 8109.90621, std: 70.19803, params: {'n_components': 3},
 mean: 8073.01742, std: 70.19804, params: {'n_components': 4},
 mean: 8036.14027, std: 70.19280, params: {'n_components': 5},
 mean: 7999.24634, std: 70.19952, params: {'n_components': 6},
 mean: 7962.35873, std: 70.19886, params: {'n_components': 7}]

10.0