In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import sys; sys.path.insert(0,'..')
from main import *

In [2]:
data = pd.read_csv('../data/magic/magic04.data')
X = data.iloc[:, :-1].values
y  = data.iloc[:, -1].values

In [3]:
# Convert labels to -1, 1 and scale features
y = np.array([-1.0 if yi == 'g' else 1.0 for yi in y])

scaler = MinMaxScaler(feature_range=(0, 1))

for i in range(X.shape[1]):
    X[:, i] = scaler.fit_transform(X[:, i].reshape(-1, 1)).ravel()
    
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, stratify=y, 
                                                    shuffle=True, random_state=1)

# Only use some of train/test data
TRAIN_SIZE = 1000
TEST_SIZE  = 500

train_samples = np.random.permutation(X_train.shape[0])[:TRAIN_SIZE]
X_train = X_train[train_samples]
y_train = y_train[train_samples]

test_samples = np.random.permutation(X_test.shape[0])[:TEST_SIZE]
X_test = X_test[test_samples]
y_test = y_test[test_samples]

In [4]:
np.random.RandomState(seed=42).permutation(10)

array([8, 1, 5, 0, 7, 2, 9, 4, 3, 6])

In [5]:
def generate_svm_rank_W(X_train, y_train, m, kernel='linear', random_seed=1):
    """ Generates pairwise comparison matrix W using Mohammad's method: learned linear SVM ranker trained on m pairs"""

    np.random.seed(random_seed)

    pos_idx = np.squeeze(np.argwhere(y_train == 1))
    neg_idx = np.squeeze(np.argwhere(y_train == -1))
    pos_idx = pos_idx[np.random.permutation(pos_idx.shape[0])[:m]]
    neg_idx = neg_idx[np.random.permutation(neg_idx.shape[0])[:m]]

    cut = np.min(pos_idx.size, neg_idx.size)
    if cut != m:
        print("Cut at %d rather than expected %d" % (cut, m))

    rnd_c = np.random.choice([1, -1], [pos_idx.shape[0], 1])[:cut]
    X_pair = (X_train[pos_idx,:][:cut] - X_train[neg_idx,:][:cut]) * rnd_c
    y_pair = (y_train[pos_idx][:cut] - y_train[neg_idx][:cut]) * np.squeeze(rnd_c) / 2

    clf = SVC(C=1.0, kernel=kernel)
    clf.fit(X_pair, y_pair)

    print("Training accuracy of rank SVM: %.2f" % clf.score(X_pair, y_pair))

    rank = clf.decision_function(X_train)
    sigma = 1
    W = gen_conf_matrix(rank, sigma)

    return W

In [9]:
# W_ = generate_svm_rank_W(X_train, y_train, 200)

In [79]:
n_s = 200

pos_ind = np.squeeze(np.argwhere(y_train == 1))
neg_ind = np.squeeze(np.argwhere(y_train == -1))
pos_ind = pos_ind[np.random.permutation(pos_ind.shape[0])[:n_s]]
neg_ind = neg_ind[np.random.permutation(neg_ind.shape[0])[:n_s]]

rnd_c = np.random.choice([1,-1],[pos_ind.shape[0],1])
X_pair = (X_train[pos_ind,:] - X_train[neg_ind,:]) * rnd_c
y_pair = rnd_c.ravel()

In [80]:
from sklearn.svm import SVC

clf = SVC(C=1.0, kernel='linear')
clf.fit(X_pair, y_pair)

clf.score(X_pair, y_pair)

0.795

In [39]:
clf.decision_function(X_train)

array([ 2.44870657e-01,  5.26528445e-01,  6.39066483e-02,  5.19706644e-01,
        6.89866183e-01,  2.45425604e-01,  2.28993253e-01,  3.57556650e-01,
        2.76043134e-01,  6.05359147e-01,  4.81567724e-01, -1.39288211e-01,
        5.01909849e-02,  5.83718330e-01,  3.73666728e+00,  2.75701368e-01,
        1.06672661e+00,  1.22940767e+00, -8.33521378e-02,  1.09169607e+00,
        1.45853880e+00,  1.73900141e-01, -4.36603555e-02,  7.88174653e-01,
        7.61809058e-01,  1.58086279e+00,  9.89019141e-01,  1.66029649e-01,
        6.44999211e-01,  2.41351372e-01,  1.39945683e+00,  2.55678269e-01,
        1.68141584e-01,  6.10645595e-01,  3.02042717e-01,  1.29710103e-01,
        2.98201630e-01,  2.87309548e+00,  1.71604895e+00,  1.84636529e-02,
        5.59083548e-01,  3.45507572e-01,  1.60263266e+00,  5.40023135e-01,
        4.53650053e-02, -1.76650280e-01,  2.60348357e-01,  2.49016843e-01,
       -2.06663618e-01,  9.01370249e-02,  3.65843546e+00,  1.51592234e-01,
        4.66302191e-01, -

In [40]:
ranks = [clf.decision_function(X_train)]
sigma_k = [5]

W = [gen_conf_matrix(rank, sigma) for rank,sigma in zip(ranks,sigma_k)][0]

In [20]:
svm_lambdaboost(X_train, y_train, X_test, y_test, W)

LINEAR SVM SUBMODELS
Using 20 classifiers and sample proportion of 1
t	Train		Test
1	0.74		0.75
2	0.74		0.75
3	0.74		0.75
4	0.74		0.75
5	0.74		0.75
6	0.74		0.75
7	0.74		0.75
8	0.74		0.75
9	0.74		0.74
10	0.74		0.74
11	0.74		0.74
12	0.74		0.74
13	0.74		0.74
14	0.74		0.74
15	0.74		0.74
16	0.74		0.74
17	0.74		0.74
18	0.73		0.74
19	0.73		0.74
20	0.73		0.74
t = 2 was best iteration



(0.255, 0.248)

    BASELINES ACCURACIES
    Guess Majority:	 0.65
    KNN:		     0.83
    Linear SVM:	     0.79
    Random Forest:	 0.75
    XGBoost:	     0.85