In [None]:
from visualization_fct import *
from sklearn.mixture import GaussianMixture

from bokeh.plotting import output_notebook, show

# Select matplotlib's interactive "notebook" backend for figures drawn below.
%matplotlib notebook
# Tell Bokeh to render the output of show() inline in the notebook.
output_notebook()

In [None]:
# Read the tab-separated input table and discard, in place, the columns that
# are not used anywhere below.
data = pd.read_csv("./asm_data_for_ml.txt", sep='\t')
data.drop(['MJD', 'error', 'errorA', 'errorB', 'errorC'], axis=1, inplace=True)

# Derived feature: element-wise ratio of the rateC column to the rateA column.
data['rateCA'] = data['rateC'] / data['rateA']

# Original author's note: remove too-large values, except for 'orbit'.
# NOTE(review): mask is not defined in this notebook (presumably it comes from
# the visualization_fct star import); its exact thresholding rule is not
# visible here.
data_thr = mask(data, 'orbit')

In [None]:
# GMM model selection with BIC: fit one GaussianMixture per
# (covariance type, number of components) pair and keep the model with the
# lowest Bayesian information criterion.

# The mixtures below are created without random_state, so they draw from
# NumPy's global RNG; seeding it makes the fits repeatable.
np.random.seed(0)

# Feature matrix: one row per sample of data_thr, one column per feature.
X = np.c_[data_thr.orbit, data_thr.rateA, data_thr.rateB, data_thr.rateC,
          data_thr.rate, data_thr.rateCA]

lowest_bic = np.inf  # np.infty was removed in NumPy 2.0; np.inf works on every version
bic = []
n_components_range = range(1, 8)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
# Single pre-formatted string: prints the same text on Python 2 and Python 3.
print("%s %s" % (best_gmm.covariance_type, best_gmm.n_components))

# Hard cluster labels and per-component membership probabilities of the winner.
preds = best_gmm.predict(X)
probs = best_gmm.predict_proba(X)

# The scores were appended covariance-type-major (outer loop = cv_type), so
# each ROW of the reshaped array must hold one covariance type. The previous
# reshape(-1, len(cv_types)).T interleaved the types and mislabelled the curves.
bic_by_type = np.array(bic).reshape(len(cv_types), -1)
for name, scores in zip(cv_types, bic_by_type):
    plt.plot(n_components_range, scores, label=name)
plt.xlabel('n_components')
plt.ylabel('BIC')
plt.legend()

In [None]:
# Store the cluster label of every row as a categorical column.
# The Series is built on data_thr's own index: column assignment aligns on
# index labels, so a default 0..n-1 index would misplace labels (or produce
# NaN) whenever data_thr's index is not exactly 0..n-1, e.g. after row filtering.
data_thr['preds'] = pd.Series(preds, index=data_thr.index).astype("category")

# Named colours handed to the plotting helpers (original note: Spectral9).
color_key = ["red", "yellow", "blue", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]
# Keep one more colour than there are distinct predicted labels.
# NOTE(review): confirm the spare entry is intended by the plotting helpers.
color_key = color_key[:len(set(preds))+1]

In [None]:
# Plot the 'rateCA' and 'rate' columns of the thresholded data with myplot.
# NOTE(review): myplot is not defined in this notebook (presumably from the
# visualization_fct star import); axis order and colouring are not visible here.
myplot(data_thr, 'rateCA', 'rate')

In [None]:
# Build a scatter matrix of data_thr with the scatter_matrix helper, passing
# the current colour list, then display the returned object with Bokeh's show().
# NOTE(review): presumably points are coloured by the 'preds' column — confirm
# in the helper, which is not defined in this notebook.
p = scatter_matrix(data_thr, spread=True, color_key=color_key)
show(p)

In [None]:
# Pass the membership probabilities (predict_proba output of the selected
# mixture) to the plot_probs_datashader helper and display the result with
# Bokeh's show().
# NOTE(review): the helper is not defined in this notebook; what exactly it
# draws is not visible here.
fig = plot_probs_datashader(probs)
show(fig)

In [None]:
# Draw a scatter matrix of data_thr with the scatter_matrix_seaborn helper.
# NOTE(review): helper is not defined in this notebook; presumably a
# seaborn-based pair plot — confirm.
scatter_matrix_seaborn(data_thr)

In [None]:
# Plot the membership probabilities together with data_thr via the plot_probas
# helper, using the current colour list.
# NOTE(review): helper is not defined in this notebook; the layout of the
# resulting figure is not visible here.
plot_probas(data_thr, probs, color_key=color_key)

In [None]:
# GMM model selection with AIC: fit one GaussianMixture per
# (covariance type, number of components) pair and keep the model with the
# lowest Akaike information criterion.
# This cell rebinds X, best_gmm, preds, probs, color_key and
# data_thr['preds'], so later cells use the AIC-selected model.

# The mixtures below are created without random_state, so they draw from
# NumPy's global RNG; seeding it makes the fits repeatable.
np.random.seed(0)

# Feature matrix: one row per sample of data_thr, one column per feature.
X = np.c_[data_thr.orbit, data_thr.rateA, data_thr.rateB, data_thr.rateC,
          data_thr.rate, data_thr.rateCA]

lowest_aic = np.inf  # np.infty was removed in NumPy 2.0; np.inf works on every version
aic = []
n_components_range = range(1, 9)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type)
        gmm.fit(X)
        aic.append(gmm.aic(X))
        if aic[-1] < lowest_aic:
            lowest_aic = aic[-1]
            best_gmm = gmm
# Single pre-formatted string: prints the same text on Python 2 and Python 3.
print("%s %s" % (best_gmm.covariance_type, best_gmm.n_components))

# Hard cluster labels and per-component membership probabilities of the winner.
preds = best_gmm.predict(X)
probs = best_gmm.predict_proba(X)

# Build the label Series on data_thr's own index: column assignment aligns on
# index labels, so a default 0..n-1 index would misplace labels (or produce
# NaN) whenever data_thr's index is not exactly 0..n-1.
data_thr['preds'] = pd.Series(preds, index=data_thr.index).astype("category")
# Named colours handed to the plotting helpers (original note: Spectral9).
color_key = ["red", "yellow", "blue", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]
# Keep one more colour than there are distinct predicted labels.
# NOTE(review): confirm the spare entry is intended by the plotting helpers.
color_key = color_key[:len(set(preds))+1]

# The scores were appended covariance-type-major (outer loop = cv_type), so
# each ROW of the reshaped array must hold one covariance type. The previous
# reshape(-1, len(cv_types)).T interleaved the types and mislabelled the curves.
aic_by_type = np.array(aic).reshape(len(cv_types), -1)
for name, scores in zip(cv_types, aic_by_type):
    plt.plot(n_components_range, scores, label=name)
plt.xlabel('n_components')
plt.ylabel('AIC')
plt.legend()

In [None]:
# Plot the 'rateCA' and 'rate' columns of data_thr with myplot; at this point
# data_thr['preds'] holds the labels of the AIC-selected model.
# NOTE(review): myplot is not defined in this notebook (presumably from the
# visualization_fct star import); axis order and colouring are not visible here.
myplot(data_thr, 'rateCA', 'rate')

In [None]:
# Build a scatter matrix of data_thr with the scatter_matrix helper, passing
# the colour list rebuilt in the AIC cell, then display it with Bokeh's show().
# NOTE(review): presumably points are coloured by the 'preds' column — confirm
# in the helper, which is not defined in this notebook.
p = scatter_matrix(data_thr, spread=True, color_key=color_key)
show(p)

In [None]:
# Pass the membership probabilities of the AIC-selected mixture to the
# plot_probs_datashader helper and display the result with Bokeh's show().
# NOTE(review): the helper is not defined in this notebook; what exactly it
# draws is not visible here.
fig = plot_probs_datashader(probs)
show(fig)

In [None]:
# Plot the membership probabilities of the AIC-selected mixture together with
# data_thr via the plot_probas helper, using the current colour list.
# NOTE(review): helper is not defined in this notebook; the layout of the
# resulting figure is not visible here.
plot_probas(data_thr, probs, color_key=color_key)