In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
import sys; sys.path.insert(0,'..')
from main import *

from SU_Classification.su_learning import SU_SL, SU_DH, convert_su_data_sklearn_compatible
from SU_Classification import misc

In [3]:
X_train, y_train = load_svmlight_file('../data/adult/a5a.txt', n_features=123)
X_train = X_train.toarray()

X_test, y_test = load_svmlight_file('../data/adult/a5a.t')
X_test = X_test.toarray()

# SU Compare

In [4]:
n_s = 500
n_u = 500

# Sizes of sets to sample similar, unlabeled points
train_size = 5000
su_cutoff  = 2500

# Only use some of training data
train_samples = np.random.permutation(X_train.shape[0])[:train_size]
X_train = X_train[train_samples]
y_train = y_train[train_samples]

# Sample similar and unlabeled points from disjoint sets
X_s_set = X_train[:su_cutoff]
y_s_set = y_train[:su_cutoff]
X_u_set = X_train[su_cutoff:]
y_u_set = y_train[su_cutoff:]

# Get positive pairs and negative pairs
X_pos = X_s_set[np.where(y_s_set ==  1)]
X_neg = X_s_set[np.where(y_s_set == -1)]

X_pos_idx = np.random.choice(X_pos.shape[0], size=(int(n_s / 2), 2))
X_neg_idx = np.random.choice(X_neg.shape[0], size=(int(n_s / 2), 2))

# Fill in similar pairs
X_s = np.full((n_s, X_train.shape[1] * 2), np.NaN)

k = 0
for (i, j) in X_pos_idx:
    X_s[k] = np.hstack((X_pos[i], X_pos[j]))
    k += 1
    
for (i, j) in X_neg_idx:
    X_s[k] =np.hstack((X_neg[i], X_neg[j]))
    k += 1

# Fill in unlabeled samples
unlabeled_samples = np.random.permutation(X_u_set.shape[0])[:n_u]
X_u = X_u_set[unlabeled_samples]

X_train, y_train = convert_su_data_sklearn_compatible(X_s, X_u)

In [5]:
%%time
clf = SU_SL(prior=0.3, lam=1e-04)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred), np.unique(y_pred, return_counts=True), np.unique(y_test, return_counts=True)

CPU times: user 58.8 ms, sys: 14.2 ms, total: 73 ms
Wall time: 42.3 ms


(0.45726087122805675,
 (array([-1.,  1.]), array([15716, 10431])),
 (array([-1.,  1.]), array([19875,  6272])))

In [6]:
y_test, y_pred

(array([-1., -1., -1., ..., -1., -1.,  1.]),
 array([-1., -1., -1., ...,  1., -1., -1.]))

# New

In [7]:
def gen(n, dim, mean=2, var=1):
    return np.random.normal(mean, var, size=(n, dim))

def synth_dataset(ns, nu, prior, nt, dim=2, mp=2):
    nsp = np.random.binomial(ns, prior**2 / (prior**2 + (1-prior)**2))
    nsn = ns - nsp
    xsp = np.hstack((gen(nsp, dim,mean=mp), gen(nsp, dim,mean=mp)))
    xsn = np.hstack((gen(nsn, dim, mean=-mp), gen(nsn, dim, mean=-mp)))
    xs = np.concatenate((xsp,xsn))
    xsr = np.concatenate((xsp.reshape(-1, dim),xsn.reshape(-1, dim)))
    yr = np.concatenate((np.ones(2*nsp),-np.ones(2*nsn)))
    

    nup = np.random.binomial(nu, prior)
    nun = nu - nup
    xu = np.concatenate((gen(nup, dim,mean=mp), gen(nun, dim,mean=-mp)))
    yu = np.concatenate((np.ones(nup), -np.ones(nun)))
    
    x_train = np.concatenate((xsr,xu))
    y_train = np.concatenate((yr,yu))
    x_train,y_train = shuffle(x_train,y_train)
    
    
    ntp = np.random.binomial(nt, prior)
    ntn = nt - ntp
    x_test = np.concatenate((gen(ntp, dim,mean=mp), gen(ntn, dim,mean=-mp)))
    y_test = np.concatenate((np.ones(ntp), -np.ones(ntn)))

    return xs, xu, x_train, y_train, x_test, y_test

x_s, x_u, x_train, y_train, x_test, y_test = synth_dataset(ns=n_s, nu=n_u, prior=0.6, nt=100, mp=1.0)

In [8]:
%%time
x_train_su, y_train_su = convert_su_data_sklearn_compatible(x_s, x_u)

clf = SU_SL(prior=0.7, lam=1e-01)
clf.fit(x_train_su, y_train_su)

y_pred = clf.predict(x_test).ravel()
accuracy = accuracy_score(y_test, y_pred)
accuracy

CPU times: user 2.89 ms, sys: 1.33 ms, total: 4.23 ms
Wall time: 2.96 ms


0.86

In [9]:
np.unique(y_pred, return_counts=True), np.unique(y_test, return_counts=True)

((array([-1.,  1.]), array([34, 66])), (array([-1.,  1.]), array([46, 54])))

In [10]:
y_pred, y_test

(array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,  1.,  1.,
         1.,  1., -1.,  1., -1.,  1., -1., -1.,  1., -1., -1., -1.,  1.,
        -1., -1., -1., -1., -1., -1., -1.,  1.,  1., -1., -1.,  1., -1.,
        -1., -1., -1., -1.,  1.,  1., -1., -1.,  1., -1., -1., -1., -1.,
         1., -1.,  1., -1., -1., -1., -1., -1.,  1.]),
 array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -

## From github

In [11]:
n_s = 500
n_u = 500

# Sizes of sets to sample similar, unlabeled points
train_size = 5000
su_cutoff  = 2500

# Only use some of training data
train_samples = np.random.permutation(X_train.shape[0])[:train_size]
X_train = X_train[train_samples]
y_train = y_train[train_samples]

# Sample similar and unlabeled points from disjoint sets
X_s_set = X_train[:su_cutoff]
y_s_set = y_train[:su_cutoff]
X_u_set = X_train[su_cutoff:]
y_u_set = y_train[su_cutoff:]

# Get positive pairs and negative pairs
X_pos = X_s_set[np.where(y_s_set ==  1)]
X_neg = X_s_set[np.where(y_s_set == -1)]

X_pos_idx = np.random.choice(X_pos.shape[0], size=(int(n_s / 2), 2))
X_neg_idx = np.random.choice(X_neg.shape[0], size=(int(n_s / 2), 2))

# Fill in similar pairs
X_s = np.full((n_s, X_train.shape[1] * 2), np.NaN)

k = 0
for (i, j) in X_pos_idx:
    X_s[k] = np.hstack((X_pos[i], X_pos[j]))
    k += 1
    
for (i, j) in X_neg_idx:
    X_s[k] =np.hstack((X_neg[i], X_neg[j]))
    k += 1

# Fill in unlabeled samples
unlabeled_samples = np.random.permutation(X_u_set.shape[0])[:n_u]
X_u = X_u_set[unlabeled_samples]

X_train, y_train = convert_su_data_sklearn_compatible(X_s, X_u)

In [12]:
# ns = 500
# nu = 500
# prior = 0.3

prior = 0.7

ns = 500
nu = 500

# Sizes of sets to sample similar, unlabeled points
train_size = 5000
su_cutoff  = 2500

# Only use some of training data
train_samples = np.random.permutation(X_train.shape[0])[:train_size]
X_train = X_train[train_samples]
y_train = y_train[train_samples]

# Sample similar and unlabeled points from disjoint sets
X_s_set = X_train[:su_cutoff]
y_s_set = y_train[:su_cutoff]
X_u_set = X_train[su_cutoff:]
y_u_set = y_train[su_cutoff:]

# Similar: get positive pairs and negative pairs
X_s_pos = X_s_set[np.where(y_s_set ==  1)]
X_s_neg = X_s_set[np.where(y_s_set == -1)]

nsp = np.random.binomial(ns, prior**2 / (prior**2 + (1-prior)**2))
nsn = ns - nsp
# xs = np.concatenate((
#     np.hstack((gen1(nsp, dim), gen1(nsp, dim))),
#     np.hstack((gen0(nsn, dim), gen0(nsn, dim)))))

xs = np.concatenate((np.hstack((X_s_pos[np.random.choice(X_s_pos.shape[0], nsp)], 
                               X_s_pos[np.random.choice(X_s_pos.shape[0], nsp)])),
                    np.hstack((X_s_neg[np.random.choice(X_s_neg.shape[0], nsn)], 
                               X_s_neg[np.random.choice(X_s_neg.shape[0], nsn)]))))

nup = np.random.binomial(nu, prior)
nun = nu - nup
# xu = np.concatenate((gen1(nup, dim), gen0(nun, dim)))

X_u_pos = X_u_set[np.where(y_u_set ==  1)]
X_u_neg = X_u_set[np.where(y_u_set == -1)]

xu = np.concatenate((X_u_pos[np.random.choice(X_u_pos.shape[0], nup)], 
                     X_u_neg[np.random.choice(X_u_neg.shape[0], nun)]))

ValueError: a must be greater than 0 unless no samples are taken

In [None]:
%%time
X_train, y_train = convert_su_data_sklearn_compatible(xs, xu)

clf = SU_SL(prior=0.7, lam=1e-04)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred), np.unique(y_pred, return_counts=True), np.unique(y_test, return_counts=True)

In [None]:
xs.shape, xu.shape

In [None]:
def synth_dataset(ns, nu, prior, dim=2):
    nsp = np.random.binomial(ns, prior**2 / (prior**2 + (1-prior)**2))
    nsn = ns - nsp
    xs = np.concatenate((
        np.hstack((gen1(nsp, dim), gen1(nsp, dim))),
        np.hstack((gen0(nsn, dim), gen0(nsn, dim)))))

    nup = np.random.binomial(nu, prior)
    nun = nu - nup
    xu = np.concatenate((gen1(nup, dim), gen0(nun, dim)))

    return xs, xu