In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
import sys; sys.path.insert(0,'..')
from main import *

from SU_Classification.su_learning import *
from SU_Classification import misc

In [3]:
# Both files are parsed with the same explicit column count. Without
# n_features, load_svmlight_file infers the width from the highest feature
# index present in each file, so train and test could come out with different
# numbers of columns and the fitted model could not score the test matrix.
N_FEATURES = 123

X_train_orig, y_train_orig = load_svmlight_file('../data/adult/a5a.txt', n_features=N_FEATURES)
X_train_orig = X_train_orig.toarray()  # sparse matrix -> dense ndarray

X_test_orig, y_test_orig = load_svmlight_file('../data/adult/a5a.t', n_features=N_FEATURES)
X_test_orig = X_test_orig.toarray()  # sparse matrix -> dense ndarray

In [4]:
def get_similar_unlabeled(X_train, y_train, ns, nu, prior=0.7, train_size=5000, su_cutoff=2500,
                          random_state=None):
    """
    Build a similar-pairs set and an unlabeled set from labeled training data.

    A random subsample of at most `train_size` rows is shuffled and split at
    `su_cutoff`: the first part is the pool for similar pairs, the remainder is
    the pool for unlabeled points, so the two outputs never share a source row.
    All rows are drawn from their pools with replacement.

    Note) true prior is set as 0.7 to be consistent with results in Table 4 of
          the Tokyo SU paper.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y_train : ndarray of shape (n_samples,)
        Labels; rows are treated as positive where the label equals 1 and as
        negative where it equals -1.
    ns : int
        Number of similar pairs to return.
    nu : int
        Number of unlabeled points to return.
    prior : float, default 0.7
        Positive class prior. A similar pair is positive-positive with
        probability prior**2 / (prior**2 + (1 - prior)**2); an unlabeled point
        is positive with probability `prior`.
    train_size : int, default 5000
        Number of rows kept from the shuffled training data.
    su_cutoff : int, default 2500
        Index at which the subsample is split into the similar pool
        (rows before it) and the unlabeled pool (rows from it onwards).
    random_state : None, int or generator object, default None
        None uses NumPy's global generator (`np.random`), so the function still
        follows `np.random.seed(...)`. An int seeds a private
        `np.random.RandomState`. An object that already provides `permutation`,
        `binomial` and `choice` is used as is.

    Returns
    -------
    xs : ndarray of shape (ns, 2 * n_features)
        Similar pairs; each row is two same-class rows placed side by side.
        Positive pairs come first, negative pairs after them.
    xu : ndarray of shape (nu, n_features)
        Unlabeled points; positives come first, negatives after them.

    Raises
    ------
    ValueError
        If rows are requested from a pool that contains no row of that class.
    """
    # Resolve the random source; the default keeps the legacy global stream.
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "choice"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def _draw(pool, count, what):
        """Return `count` rows of `pool`, sampled with replacement."""
        if pool.shape[0] == 0:
            if count > 0:
                raise ValueError(
                    "cannot draw %d rows: no %s available "
                    "(check labels, train_size and su_cutoff)" % (count, what)
                )
            return pool[:0]
        return pool[rng.choice(pool.shape[0], count)]

    # Only use some of training data
    train_samples = rng.permutation(X_train.shape[0])[:train_size]
    X_train = X_train[train_samples]
    y_train = y_train[train_samples]

    # Sample similar and unlabeled points from disjoint sets
    X_s_set, y_s_set = X_train[:su_cutoff], y_train[:su_cutoff]
    X_u_set, y_u_set = X_train[su_cutoff:], y_train[su_cutoff:]

    # Calculate number of positive/negative similar pairs using prior
    nsp = rng.binomial(ns, prior**2 / (prior**2 + (1 - prior)**2))
    nsn = ns - nsp

    # Similar: split the pool by class
    X_s_pos = X_s_set[y_s_set == 1]
    X_s_neg = X_s_set[y_s_set == -1]

    # Get similar pairs: two independent same-class draws placed side by side
    pos_left = _draw(X_s_pos, nsp, "positive rows in the similar pool")
    pos_right = _draw(X_s_pos, nsp, "positive rows in the similar pool")
    neg_left = _draw(X_s_neg, nsn, "negative rows in the similar pool")
    neg_right = _draw(X_s_neg, nsn, "negative rows in the similar pool")
    xs = np.concatenate((np.hstack((pos_left, pos_right)),
                         np.hstack((neg_left, neg_right))))

    # Calculate number of positive/negative unlabeled points using prior
    nup = rng.binomial(nu, prior)
    nun = nu - nup

    # Unlabeled: split the pool by class
    X_u_pos = X_u_set[y_u_set == 1]
    X_u_neg = X_u_set[y_u_set == -1]

    # Get unlabeled points
    xu = np.concatenate((_draw(X_u_pos, nup, "positive rows in the unlabeled pool"),
                         _draw(X_u_neg, nun, "negative rows in the unlabeled pool")))

    return xs, xu

In [5]:
# --- Experiment configuration ---------------------------------------------
SEED = 0            # seeds NumPy's global RNG, which get_similar_unlabeled draws from
N_TRIALS = 100      # number of independent resampling trials
N_SIMILAR = 500     # similar pairs drawn per trial
N_UNLABELED = 500   # unlabeled points drawn per trial

# Class prior handed to SU_SL. It is fixed by hand here rather than estimated
# per trial with class_prior_estimation(xs, xu).
# NOTE(review): get_similar_unlabeled generates the data with its default
# prior=0.7, which differs from this value -- confirm that this is intended.
est_prior = 0.54

# Candidate regularisation strengths searched by cross-validation.
lam_list = [1e-01, 1e-04, 1e-07]

# Without a fixed seed every run of this cell yields a different mean accuracy.
np.random.seed(SEED)

acc_ls = []

for i in range(N_TRIALS):

    # Draw a fresh SU sample and convert it to the (X, y) form the estimator expects.
    xs, xu = get_similar_unlabeled(X_train_orig, y_train_orig, N_SIMILAR, N_UNLABELED)
    X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

    # cross-validation
    score_cv_list = []
    for lam in lam_list:
        clf = SU_SL(prior=est_prior, lam=lam)
        score_cv = cross_val_score(clf, X_train, y_train, cv=5).mean()
        score_cv_list.append(score_cv)

    # training with the best hyperparameter (first one wins on ties)
    lam_best = lam_list[np.argmax(score_cv_list)]
    clf = SU_SL(prior=est_prior, lam=lam_best)
    clf.fit(X_train, y_train)

    # test prediction
    y_pred = clf.predict(X_test_orig)
    accuracy = accuracy_score(y_test_orig, y_pred)
    # Keep the better of the accuracy and its label-flipped counterpart;
    # presumably because the SU classifier cannot tell which side is the
    # positive class -- TODO confirm.
    acc_ls.append(max(accuracy, 1 - accuracy))

np.mean(acc_ls)

0.6202971660228707