In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
import sys; sys.path.insert(0,'..')
from main import *

from SU_Classification.su_learning import *
from SU_Classification import misc

In [2]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same split and
# the same random draws made later through the global NumPy RNG.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Load the phishing dataset (svmlight format) and densify the feature matrix
X, y = load_svmlight_file('../data/phishing/phishing.txt')
X = X.toarray()

# Convert labels from {0, 1} to {-1, 1}
y = y * 2 - 1

# 50/50 stratified split; the *_orig arrays are the labelled pools that the
# later cells resample from.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X, y, train_size=0.5, stratify=y, shuffle=True, random_state=RANDOM_SEED)

In [22]:
def _draw_rows_with_replacement(pool, count, what):
    """Return `count` rows of `pool` drawn uniformly at random with replacement."""
    if count == 0:
        return pool[:0]
    if pool.shape[0] == 0:
        raise ValueError('cannot draw %d %s: no candidate points available' % (count, what))
    return pool[np.random.choice(pool.shape[0], count)]


def _take_rows_without_replacement(pool, count, what):
    """Return `count` distinct rows of `pool`, in random order."""
    if count > pool.shape[0]:
        raise ValueError('need %d %s but only %d are available'
                         % (count, what, pool.shape[0]))
    return pool[np.random.permutation(pool.shape[0])[:count]]


def get_similar_unlabeled(X_train, y_train, X_test, y_test, ns, nu, 
                          prior=0.7, train_size=5000, test_size=500, su_cutoff=2500):
    """ 
    Build a similar/unlabeled (SU) training sample and a prior-matched test sample.

    Parameters
    ----------
    X_train, y_train : labelled training pool; labels are compared against 1 and -1
    X_test, y_test   : labelled test pool; labels are compared against 1 and -1
    ns               : number of similar pairs to draw
    nu               : number of unlabeled points to draw
    prior            : class prior of the positive class used for all sampling
    train_size       : number of randomly chosen training rows that are kept
    test_size        : number of rows in the returned test sample
    su_cutoff        : the first `su_cutoff` kept rows feed the similar pairs, the
                       remaining ones feed the unlabeled points

    Returns
    -------
    xs         : array with `ns` rows; each row is two same-class points stacked
                 side by side (positive pairs first, then negative pairs)
    xu         : array with `nu` rows (positive points first, then negative points)
    X_test_new : `test_size` test rows, shuffled
    y_test_new : labels in {1, -1} aligned with `X_test_new`

    Raises
    ------
    ValueError : if a pool does not contain enough points of the required class

    Note) true prior is set as 0.7 to be consistent with results in Table 4 of the Tokyo SU
          paper, train_size and su_cutoff are used to ensure that similar and unlabeled points
          are from disjoint sets
    """

    # Only use some of training data
    train_samples = np.random.permutation(X_train.shape[0])[:train_size]
    X_train = X_train[train_samples]
    y_train = y_train[train_samples]

    # Sample similar and unlabeled points from disjoint sets
    X_s_set, y_s_set = X_train[:su_cutoff], y_train[:su_cutoff]
    X_u_set, y_u_set = X_train[su_cutoff:], y_train[su_cutoff:]

    # Calculate number of positive/negative similar pairs using prior:
    # a similar pair is positive with probability prior^2 / (prior^2 + (1-prior)^2)
    nsp = np.random.binomial(ns, prior**2 / (prior**2 + (1-prior)**2))
    nsn = ns - nsp

    # Similar: split the candidate set by class
    X_s_pos = X_s_set[y_s_set ==  1]
    X_s_neg = X_s_set[y_s_set == -1]

    # Get similar pairs: both members of a pair are drawn independently from the same class
    xs = np.concatenate((
        np.hstack((_draw_rows_with_replacement(X_s_pos, nsp, 'positive similar points'),
                   _draw_rows_with_replacement(X_s_pos, nsp, 'positive similar points'))),
        np.hstack((_draw_rows_with_replacement(X_s_neg, nsn, 'negative similar points'),
                   _draw_rows_with_replacement(X_s_neg, nsn, 'negative similar points')))))

    # Calculate number of positive/negative unlabeled points using prior
    nup = np.random.binomial(nu, prior)
    nun = nu - nup

    # Unlabeled: split the candidate set by class
    X_u_pos = X_u_set[y_u_set ==  1]
    X_u_neg = X_u_set[y_u_set == -1]

    # Get unlabeled points
    xu = np.concatenate((_draw_rows_with_replacement(X_u_pos, nup, 'positive unlabeled points'),
                         _draw_rows_with_replacement(X_u_neg, nun, 'negative unlabeled points')))

    # Sample test data according to class prior.
    # round() instead of plain int(): float products such as 100 * 0.29 evaluate to
    # 28.999999999999996 and would otherwise be truncated to 28.
    ntp = int(round(test_size * prior))
    ntn = test_size - ntp
    X_test_new = np.concatenate((
        _take_rows_without_replacement(X_test[y_test ==  1], ntp, 'positive test points'),
        _take_rows_without_replacement(X_test[y_test == -1], ntn, 'negative test points')))
    y_test_new = np.concatenate((np.ones(ntp), -np.ones(ntn)))

    # Shuffle features and labels with one shared permutation so they stay aligned
    order = np.random.permutation(X_test_new.shape[0])
    X_test_new, y_test_new = X_test_new[order], y_test_new[order]

    return xs, xu, X_test_new, y_test_new

In [None]:
# xs, xu, X_test_new, y_test_new = get_similar_unlabeled(X_train_orig, y_train_orig, X_test_orig, y_test_orig, ns=500, nu=500)
# X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

In [None]:
# %%time

# # prior_estimate = class_prior_estimation(xs, xu)
# est_prior = 0.54
# print('Prior estimate: %.2f\n' % est_prior)

# clf = SU_SL(prior=est_prior, lam=1e-04)
# clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test)
# acc = accuracy_score(y_test, y_pred)

# print(max(acc, 1 - acc), 
#       np.unique(y_pred, return_counts=True), 
#       np.unique(y_test, return_counts=True),
#       sep='\n')

In [19]:
# Repeat the SU squared-loss experiment on independent SU resamplings and
# collect the test accuracy of every trial in acc_ls.
n_trials = 100
lam_list = [1e-01, 1e-04, 1e-07]   # candidate regularization strengths
acc_ls = []

for trial in range(n_trials):

    # get_similar_unlabeled returns (xs, xu, X_test_new, y_test_new); only the SU
    # training sample is used here because this cell scores on the full held-out split.
    xs, xu = get_similar_unlabeled(X_train_orig, y_train_orig, X_test_orig, y_test_orig,
                                   ns=500, nu=500)[:2]
    X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

    # Fixed prior estimate; alternative: est_prior = class_prior_estimation(xs, xu)
    est_prior = 0.54

    # cross-validation
    score_cv_list = []
    for lam in lam_list:
        clf = SU_SL(prior=est_prior, lam=lam)
        score_cv = cross_val_score(clf, X_train, y_train, cv=5).mean()
        score_cv_list.append(score_cv)

    # training with the best hyperparameter
    lam_best = lam_list[np.argmax(score_cv_list)]
    clf = SU_SL(prior=est_prior, lam=lam_best)
    clf.fit(X_train, y_train)

    # test prediction; the accuracy is recorded up to a global label flip
    y_pred = clf.predict(X_test_orig)
    accuracy = accuracy_score(y_test_orig, y_pred)
    acc_ls.append(max(accuracy, 1 - accuracy))

np.mean(acc_ls)

0.7935672937771346

In [20]:
# Standard error of the mean accuracy; divide by the actual number of recorded
# trials instead of a hard-coded count so the two cannot drift apart.
np.std(acc_ls) / np.sqrt(len(acc_ls))

0.007278279742348672

In [4]:
# Single SU squared-loss run with an estimated class prior.
# get_similar_unlabeled returns (xs, xu, X_test_new, y_test_new); only the SU
# training sample is used here because this cell scores on the full held-out split.
xs, xu = get_similar_unlabeled(X_train_orig, y_train_orig, X_test_orig, y_test_orig,
                               ns=500, nu=500, prior=0.7)[:2]
X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

# Estimate the class prior from the SU sample (a fixed 0.54 was used as an alternative)
est_prior = class_prior_estimation(xs, xu)

# cross-validation
lam_list = [1e-01, 1e-04, 1e-07]
score_cv_list = []
for lam in lam_list:
    clf = SU_SL(prior=est_prior, lam=lam)
    score_cv = cross_val_score(clf, X_train, y_train, cv=5).mean()
    score_cv_list.append(score_cv)

# training with the best hyperparameter
lam_best = lam_list[np.argmax(score_cv_list)]
clf = SU_SL(prior=est_prior, lam=lam_best)
clf.fit(X_train, y_train)

# test prediction
y_pred = clf.predict(X_test_orig)
accuracy = accuracy_score(y_test_orig, y_pred)
print('Accuracy: %.2f' % accuracy)

# Compare predicted and true label counts
print(np.unique(y_pred, return_counts=True), 
      np.unique(y_test_orig, return_counts=True),
      sep='\n')

Accuracy: 0.74
(array([-1.,  1.]), array([2687, 2841]))
(array([-1.,  1.]), array([2449, 3079]))


In [5]:
# Display the current value of est_prior (assigned from class_prior_estimation(xs, xu)
# in the preceding cell)
est_prior

0.538161638848609

# New: evaluate on a test sample drawn to match the class prior

In [37]:
# Draw 500 similar pairs and 500 unlabeled points with class prior 0.7, together
# with a test sample that get_similar_unlabeled draws according to the same prior.
xs, xu, X_test_new, y_test_new = get_similar_unlabeled(X_train_orig, y_train_orig, X_test_orig, y_test_orig,
                                                       ns=500, nu=500, prior=0.7)
# Reformat the SU sample into the (X, y) form passed to fit() below
X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

# Class prior estimated from the SU sample itself
est_prior = class_prior_estimation(xs, xu)

# SU_DH classifier with a fixed regularization strength (no cross-validation in this cell)
clf = SU_DH(prior=est_prior, lam=1e-01)
clf.fit(X_train, y_train)

# Score on the prior-matched test sample returned above
y_pred = clf.predict(X_test_new)
accuracy = accuracy_score(y_test_new, y_pred)
print('Accuracy: %.2f' % accuracy)

Accuracy: 0.70


# Simple dataset

In [None]:
def gen1(n, dim, mean=2, var=1):
    """Draw `n` points in `dim` dimensions from an isotropic Gaussian.

    Every coordinate is sampled independently with mean `mean` and variance
    `var`. Returns an array of shape (n, dim).
    """
    # np.random.normal expects a standard deviation as its second argument,
    # so the variance is converted before sampling.
    return np.random.normal(mean, np.sqrt(var), size=(n, dim))


def gen0(n, dim, mean=-2, var=1):
    """Draw `n` points in `dim` dimensions from an isotropic Gaussian.

    Every coordinate is sampled independently with mean `mean` and variance
    `var`. Returns an array of shape (n, dim).
    """
    # np.random.normal expects a standard deviation as its second argument,
    # so the variance is converted before sampling.
    return np.random.normal(mean, np.sqrt(var), size=(n, dim))

In [None]:
from sklearn.utils import shuffle

n_per_class = 10000
n_test_per_class = 1000

# Labelled synthetic pool: gen0 points are the negative class (-1) and gen1
# points the positive class (+1), matching the generator names.
X_sim = np.concatenate((gen0(n_per_class, 2), gen1(n_per_class, 2)))
y_sim = np.concatenate((-np.ones(n_per_class), np.ones(n_per_class)))
X_sim, y_sim = shuffle(X_sim, y_sim)

# Separate labelled pool: get_similar_unlabeled needs one to draw its test sample from
X_sim_test = np.concatenate((gen0(n_test_per_class, 2), gen1(n_test_per_class, 2)))
y_sim_test = np.concatenate((-np.ones(n_test_per_class), np.ones(n_test_per_class)))

# NOTE(review): the SU sample is drawn with get_similar_unlabeled's default prior (0.7)
# while the classifier below is given prior=0.51 - confirm this is intended.
xs, xu, X_sim_test_new, y_sim_test_new = get_similar_unlabeled(
    X_sim, y_sim, X_sim_test, y_sim_test, ns=500, nu=500)
X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

clf = SU_DH(prior=0.51, lam=1e-4)
clf.fit(X_train, y_train)

In [None]:
# Predict with the classifier fitted in the preceding cell, on the same inputs it was fitted on
clf.predict(X_train)

# Example

In [None]:
# Run the imported main() with the "double-hinge" option at prior=0.5
# (argument semantics are defined in main.py, which is not shown here).
# NOTE(review): SU risk estimators typically scale by 1 / (2 * prior - 1) - confirm
# that main() copes with prior=0.5.
main("double-hinge", prior=0.5, n_s=500, n_u=500, end_to_end=True)

In [None]:
# Run the imported main() with the "double-hinge" option at prior=0.51
# (argument semantics are defined in main.py, which is not shown here).
main("double-hinge", prior=0.51, n_s=500, n_u=500, end_to_end=True)

In [None]:
# Run the imported main() with the "double-hinge" option at prior=0.7
# (argument semantics are defined in main.py, which is not shown here).
main("double-hinge", prior=0.7, n_s=500, n_u=500, end_to_end=True)

In [None]:
# Run the imported main() with the "double-hinge" option at prior=0.9
# (argument semantics are defined in main.py, which is not shown here).
main("double-hinge", prior=0.9, n_s=500, n_u=500, end_to_end=True)