Features were computed offline using the procedure described above. Here we load the features for 10,000 VVV light curves.

In [None]:
# Load the precomputed periods and the feature matrices of the P and U sets.
# Each file is opened in a context manager so its handle is closed as soon as
# the pickle has been read (the bare open(...) calls never closed them).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("data/lc_periods.pkl", "rb") as pkl_file:
    lc_periods = pickle.load(pkl_file)
with open("data/features_P.pkl", "rb") as pkl_file:
    data_P = pickle.load(pkl_file)
with open("data/features_U.pkl", "rb") as pkl_file:
    data_U = pickle.load(pkl_file)

# Column labels for the feature matrices: GP1..GP40 first, followed by the
# seven named features, in the same order as the original append sequence.
feature_names = ["GP%d" % k for k in range(1, 41)]
feature_names += [
    'NMSE',
    'NSFD',
    'Median',
    'IQR',
    'Skewness',
    'Kurtosis',
    'Freq',
]

In [None]:
# One figure per dataset (U first, then P), each holding a row of seven
# histograms. Panel i shows column -i-1, so the panels walk backwards through
# the last seven columns; only the left-most panel carries the set label.
for dataset, set_label in ((data_U, 'U set'), (data_P, 'P set')):
    fig = plt.figure(figsize=(16, 2))
    for i in range(7):
        col = -i - 1
        ax = fig.add_subplot(1, 7, i+1)
        ax.hist(dataset[:, col])
        ax.set_title(feature_names[col])
        if i == 0:
            ax.set_ylabel(set_label)
    plt.tight_layout(pad=0.1)

# Side-by-side panels with the mean of the first 40 columns (the GP features)
# and a +/- 2 standard deviation band, for the U set (left) and P set (right).
# The 40 values are tiled twice along x (presumably to show two phase cycles).
fig = plt.figure(figsize=(16, 2))
x_two_cycles = range(80)
ylims = None

panels = ((data_U, "Average fit in U set"), (data_P, "Average fit in P set"))
for panel, (dataset, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, panel)
    class_mu = np.mean(dataset[:, :40], axis=0)
    class_std = np.std(dataset[:, :40], axis=0)
    band_low = np.tile(class_mu - 2*class_std, 2)
    band_high = np.tile(class_mu + 2*class_std, 2)
    ax.plot(x_two_cycles, np.tile(class_mu, 2))
    ax.fill_between(x_two_cycles, band_low, band_high, alpha=0.5)
    ax.invert_yaxis()
    if ylims is None:
        # First panel: remember its (inverted) limits and label the y axis.
        ylims = ax.get_ylim()
        plt.ylabel("GP fit")
    else:
        # Later panel: reuse the first panel's limits so both share a scale.
        ax.set_ylim(ylims)
    plt.title(panel_title)

## Transductive Positive-Unlabeled (PU) learning

PU learning is a special case of semi-supervised learning. The inputs to PU methods are two datasets: P and U. P is a dataset of known objects of one particular category (in this case the RR Lyrae stars). U is an unlabeled set that may contain elements of P and of many other categories. The objective is to recover all the examples of U that are similar to P. In this case we train a random forest where bootstrapping is applied only to the U set. Each tree gives a probability to the out-of-bag (oob) samples (points that were not used to train it). The average oob prediction is our transductive label for the elements of U.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from joblib import Parallel, delayed
import time

# Sizes and training labels for the transductive forest.
T = 1000  # number of trees
NP, NU = data_P.shape[0], data_U.shape[0]
K = 2*NP  # Size of the bootstrap
# Labels for one training set: the NP rows of P are 1, the K bootstrap rows 0.
Y = np.concatenate((np.ones(NP), np.zeros(K)))
M = data_U.shape[1]

tic = time.time()
# Per-row accumulators over U: number of out-of-bag predictions received and
# the running sum of the two-column probabilities; plus summed importances.
n_oob = np.zeros(NU)
f_oob = np.zeros((NU, 2))
feature_importance = np.zeros(M)

# Create forest
models = [
    DecisionTreeClassifier(max_depth=None, max_features='sqrt', criterion='entropy')
    for _ in range(T)
]

def parallel_transduce_tree(t):
    """Fit one tree on P plus a bootstrap of U and score the out-of-bag rows.

    Reads the module-level ``data_P``, ``data_U``, ``Y``, ``NU`` and ``K``.

    Parameters
    ----------
    t : estimator
        Unfitted tree exposing ``fit``, ``predict_proba`` and
        ``feature_importances_``; it is fitted in place.

    Returns
    -------
    idx_oob : numpy.ndarray of int
        Sorted indices of the rows of ``data_U`` that were not drawn into the
        bootstrap sample (the out-of-bag rows).
    proba : array
        ``t.predict_proba`` evaluated on ``data_U[idx_oob]``.
    importances : array
        ``t.feature_importances_`` of the fitted tree.
    """
    # Bootstrap resample: K row indices of U, drawn with replacement
    b = np.random.choice(np.arange(NU), replace=True, size=K)
    data_bootstrap = np.concatenate((data_P, data_U[b, :]), axis=0)
    # Train tree
    t.fit(data_bootstrap, Y)
    # Predict in oob. A boolean mask finds the rows that were never drawn
    # without building Python sets of NU integers for every tree; that work
    # holds the GIL and would serialise the threads running this function.
    oob_mask = np.ones(NU, dtype=bool)
    oob_mask[b] = False
    idx_oob = np.flatnonzero(oob_mask)
    return idx_oob, t.predict_proba(data_U[idx_oob]), t.feature_importances_

# Run every tree through parallel_transduce_tree on four worker threads.
pool = Parallel(n_jobs=4, backend='threading')
ans = pool(delayed(parallel_transduce_tree)(t) for t in models)

# Accumulate, per row of U, the out-of-bag probabilities and how many trees
# contributed; also sum the per-tree feature importances.
for oob_rows, oob_proba, tree_importance in ans:
    f_oob[oob_rows] += oob_proba
    n_oob[oob_rows] += 1
    feature_importance += tree_importance

# Average over the trees that scored each row, and over all T trees for the
# importances. NOTE(review): a row that was never out-of-bag has n_oob == 0
# and would come out as NaN here.
probs = f_oob / n_oob[:, np.newaxis]
feature_importance /= T

elapsed = time.time() - tic
print("Elapsed time: %0.2f [s]" % elapsed)

## Feature importance from the transductive random forest

From the random forest we can recover the importance of the features. The importance comes from the metric that was used to choose the feature splits in each tree. In this case we used the information gain (the difference in information entropy before and after the split). The higher the information gain, the higher up the feature appears in the tree. If a feature is near the top of all the trees, it is an indicator that it is an important feature. Note that more sophisticated feature selection schemes would also measure the redundancy and/or synergy between features.

In [None]:
# Rank the features from most to least important.
fidx = np.argsort(feature_importance)[::-1]

# Bar chart of the importances in ranked order, labelled with feature names.
fig = plt.figure(figsize=(12, 4))
ax = fig.add_subplot(1, 1, 1)
# The tick labels below sit at x + 0.5, i.e. the middle of a unit-width bar
# whose left edge is at x. Request edge alignment explicitly: matplotlib >= 2.0
# centres bars on x by default, which would leave every label on a bar edge.
ax.bar(np.arange(len(feature_importance)), feature_importance[fidx], alpha=0.5, width=1.0,
       align='edge')
plt.xticks(np.arange(M)+0.5, np.array(feature_names)[fidx], rotation='vertical')
plt.ylabel('Feature importance')
plt.grid()

# Mean of the first 40 columns of the P set, drawn at 40 evenly spaced x
# positions in [0, 1]; each marker is coloured by that column's importance.
fig, ax = plt.subplots(figsize=(10, 3.5))
avg_lc = np.mean(data_P[:, :40], axis=0)
phase_grid = np.linspace(0, 1, num=40)
sp = ax.scatter(
    phase_grid,
    avg_lc,
    s=400,
    c=feature_importance[:40],
    linewidth=0,
    alpha=0.6,
    cmap=plt.cm.Spectral_r,
)
ax.set_xlabel('Phase')
ax.set_ylabel('Normalized magnitude')
plt.colorbar(sp, label='Importance')
ax.grid()
# Flip the y axis so that smaller values are drawn higher up.
ax.invert_yaxis()

The frequency (period) is the most important feature. The points around $\phi=0.2$ are also important, as they highlight the characteristic asymmetry of the RR Lyrae light curves.