In [17]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
import sys; sys.path.insert(0,'..')
from main import *

from SU_Classification.su_learning import *
from SU_Classification import misc

In [39]:
# Read the phishing dataset (svmlight/libsvm text format) from the repo's data directory.
X, y = load_svmlight_file('../data/phishing/phishing.txt')

# Materialise the loaded features via .toarray() and map labels from {0, 1} to {-1, 1}.
X, y = X.toarray(), 2 * y - 1

# 50/50 train/test split, stratified on the converted labels; fixed seed so the
# split itself is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.5,
    shuffle=True,
    stratify=y,
    random_state=1,
)

In [40]:
def get_similar_unlabeled(X_train, y_train, ns, nu, train_size=5000, su_cutoff=2500,
                          random_state=None):
    """
    Draw similar pairs and unlabeled points from labeled data.

    The labeled data is shuffled and truncated to `train_size` rows. The first
    `su_cutoff` rows of that subset form the pool for similar pairs and the
    remaining rows form the pool for unlabeled points, so the two pools are
    disjoint. Rows are then sampled *with replacement* inside each pool, so a
    point may appear several times (a similar pair may even contain the same
    point twice).

    Parameters
    ----------
    X_train : ndarray
        Feature matrix; assumed 2-D (n_samples, n_features) so that two points
        can be stacked side by side.
    y_train : ndarray
        Labels; only the values 1 and -1 are recognised.
    ns : int
        Number of similar pairs to draw.
    nu : int
        Number of unlabeled points to draw.
    train_size : int, default 5000
        Number of shuffled rows kept before the pools are formed.
    su_cutoff : int, default 2500
        Row index (within the kept subset) separating the similar pool from
        the unlabeled pool.
    random_state : None, int or numpy.random.RandomState, default None
        None keeps the historical behaviour of drawing from the global
        `np.random` state. An int seeds a private generator; an existing
        RandomState instance is used as is. Pass a value to make the draw
        reproducible.

    Returns
    -------
    xs : ndarray
        One row per similar pair: the features of the two same-class points
        concatenated horizontally. Positive pairs come first, then negative.
    xu : ndarray
        One row per unlabeled point. Positive points come first, then negative.

    Raises
    ------
    ValueError
        If points of a class must be drawn but the corresponding pool contains
        no point of that class.

    Notes
    -----
    The class prior is computed on the full `y_train` (before sub-sampling)
    and is printed as a side effect.
    """
    # Pick the random source. The module-level functions of np.random share the
    # global state, so the default path draws exactly as the unseeded version did.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def draw(pool, count, description):
        # Sample `count` rows of `pool` with replacement; fail with a clear
        # message instead of an opaque NumPy error when the pool is empty.
        if count > 0 and pool.shape[0] == 0:
            raise ValueError('Cannot draw %d %s: no such points in the pool '
                             '(check train_size / su_cutoff)' % (count, description))
        return pool[rng.choice(pool.shape[0], count)]

    # Use prior from dataset
    prior = np.sum(y_train == 1) / y_train.shape[0]
    print('Dataset prior: %.2f' % prior)

    # Only use some of training data
    train_samples = rng.permutation(X_train.shape[0])[:train_size]
    X_train = X_train[train_samples]
    y_train = y_train[train_samples]

    # Sample similar and unlabeled points from disjoint sets
    X_s_set, y_s_set = X_train[:su_cutoff], y_train[:su_cutoff]
    X_u_set, y_u_set = X_train[su_cutoff:], y_train[su_cutoff:]

    # Calculate number of positive/negative similar pairs using prior:
    # a same-class pair is positive with probability p^2 / (p^2 + (1-p)^2)
    nsp = rng.binomial(ns, prior**2 / (prior**2 + (1 - prior)**2))
    nsn = ns - nsp

    # Similar: split the pool by class
    X_s_pos = X_s_set[y_s_set == 1]
    X_s_neg = X_s_set[y_s_set == -1]

    # Get similar pairs (both members of a pair are drawn independently from
    # the same class); draw order is kept stable for seeded reproducibility
    pos_first = draw(X_s_pos, nsp, 'positive similar points')
    pos_second = draw(X_s_pos, nsp, 'positive similar points')
    neg_first = draw(X_s_neg, nsn, 'negative similar points')
    neg_second = draw(X_s_neg, nsn, 'negative similar points')
    xs = np.concatenate((np.hstack((pos_first, pos_second)),
                         np.hstack((neg_first, neg_second))))

    # Calculate number of positive/negative unlabeled points using prior
    nup = rng.binomial(nu, prior)
    nun = nu - nup

    # Unlabeled: split the pool by class
    X_u_pos = X_u_set[y_u_set == 1]
    X_u_neg = X_u_set[y_u_set == -1]

    # Get unlabeled points
    xu = np.concatenate((draw(X_u_pos, nup, 'positive unlabeled points'),
                         draw(X_u_neg, nun, 'negative unlabeled points')))

    return xs, xu

In [41]:
# Draw 500 similar pairs and 500 unlabeled points from the labeled training split.
# NOTE(review): no NumPy seed is set in the cells shown, so xs / xu differ on
# every run of this cell.
xs, xu = get_similar_unlabeled(X_train, y_train, 500, 500)
# NOTE(review): this rebinds X_train / y_train to the output of
# convert_su_data_sklearn_compatible, so the labeled split is no longer reachable
# under these names and re-running this cell would feed the converted data back
# into get_similar_unlabeled. Re-run the train_test_split cell first, or consider
# dedicated names for the SU data (the fit cell would need the same rename).
X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

Dataset prior: 0.56


In [46]:
%%time

# prior_estimate = class_prior_estimation(xs, xu)
prior_estimate = 0.538161638848609
print('Prior estimate: %.2f\n' % prior_estimate)

clf = SU_DH(prior=prior_estimate, lam=1e-07)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(max(acc, 1 - acc), 
      np.unique(y_pred, return_counts=True), 
      np.unique(y_test, return_counts=True),
      sep='\n')

Prior estimate: 0.54

0.5569826338639653
(array([1.]), array([5528]))
(array([-1.,  1.]), array([2449, 3079]))
CPU times: user 25.2 s, sys: 240 ms, total: 25.5 s
Wall time: 11.4 s
