# SVM Ranking Demo
* This demo is a simulation. Imagine a list of N items [1, 2, 3, ... , N] which we know the ground truth ranking for.
* Each item is simply ranked by its index or number.
* Now let's randomly show an SVC a subset of all possible comparisons and let the SVC infer the ranking.
* How good is the generated ranking?

# Generate Data 

In [343]:
from random import random
import numpy as np

In [344]:
# Simulation parameters
N   = 200          # Number of items to rank
PR  = 15           # Comparisons sampled per item (N * PR unique pairs are drawn)
NOS = 0.1          # Noise ratio: approximate fraction of comparisons labelled -1 instead of +1
CPP = 1            # Comparisons per pair: how many times the sampled pair list is repeated
TOT = N * PR * CPP # Total number of comparisons (rows of X, entries of y)

In [345]:
# Generate all possible unique pairs and then subsample
# (imported by name so the `random()` function imported earlier is not shadowed)
from itertools import combinations
from random import sample

# Generate all N * (N - 1) / 2 unordered pairs (x, y) with x < y,
# in the same lexicographic order a nested loop over range(N) would give.
poss_pairs = list(combinations(range(N), 2))

# Sample N * PR (items * pairs_per_item) distinct pairs without replacement,
# then repeat the whole draw CPP times so each sampled pair appears CPP times.
# NOTE: sample() raises ValueError if N * PR exceeds len(poss_pairs).
pairs = sample(poss_pairs, N * PR) * CPP
pairs = np.array(pairs)

pairs[:5]

array([[128, 188],
       [ 92, 161],
       [117, 127],
       [124, 171],
       [ 62,  72]])

# Convert to One-hot

In [346]:
# One-hot encoding

# Each pair becomes the difference of two one-hot rows, e.g.
# (0, 3) => [1, 0, 0, 0] - [0, 0, 0, 1] = [1, 0, 0, -1]
idy = np.identity(N)


def vectorise(x):
    """Return one-hot(x[0]) minus one-hot(x[1]) as a length-N vector."""
    first, second = idy[x[0]], idy[x[1]]
    return first - second


# Encode every sampled pair as one row of X
X = np.array([vectorise(pair) for pair in pairs])

X[:5]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.

# Add Noise

In [393]:
# Add noise
# One label per comparison: +1 by default, -1 whenever the draw from
# random() does not exceed the noise ratio NOS.
y = np.array([-1 if random() <= NOS else 1 for _ in range(TOT)])
y = y.reshape(-1, 1)  # column vector, one row per comparison

y[:20]

array([[ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [-1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [ 1]])

# Balance Class Labels

In [348]:
# Balance class labels by randomly reversing the direction of comparisons
# One coin toss per comparison: +1 keeps the row as it is, -1 reverses it
flip = np.array([-1 if random() <= 0.5 else 1 for _ in range(TOT)])
flip = flip.reshape(-1, 1)

# Reversing a comparison negates both its feature row and its label
X = flip * X
y = flip * y

X[:5]

array([[-0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -1., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0.

# Rank using SVC

In [349]:
from sklearn.svm import LinearSVC

In [350]:
def SVC_rank(X, Y):
    """Rank items by the weights of a linear SVM fitted to pairwise comparisons.

    Parameters
    ----------
    X : array-like, one row per comparison
        In this notebook each row is a signed one-hot difference between
        the two compared items, so there is one feature per item.
    Y : array-like of labels, 1-D or a single column
        One label per row of ``X``. A column vector is flattened before
        fitting.

    Returns
    -------
    numpy.ndarray
        Feature (item) indices ordered by decreasing learned weight, with
        the same 2-D shape as ``coef_`` (one row for a two-class problem).
    """
    # LinearSVC expects 1-D labels; flattening here avoids the
    # DataConversionWarning raised when a column vector is passed in.
    labels = np.ravel(Y)
    model = LinearSVC().fit(X, labels)

    # argsort is ascending, so reverse along the feature axis to put the
    # largest weights (highest-ranked items) first.
    res = model.coef_
    ranks = np.flip(np.argsort(res), axis=1)

    return ranks

In [351]:
# Fit the ranker on the sampled comparisons and display the predicted ordering
SVC_rank(X, y)

  y = column_or_1d(y, warn=True)


array([[  8,   0,  14,  47,   4,  12,   2,  13,  15,  20,  32,   5,  10,
         19,   3,   7,  24,  11,   9,  36,  29,  30,  45,  26,  37,   6,
         69,  34,  38,  46,  16,  17,  18,   1,  27,  25,  23,  40,  41,
         39,  28,  62,  22,  81,  52,  59,  43,  33,  51,  42,  70,  44,
         21,  53,  58,  31,  56,  49,  55,  54,  35,  67,  79,  65,  57,
         76,  71,  68,  90,  66,  83,  60,  48,  50,  75,  85, 107,  72,
        105,  86,  73, 125,  95,  78,  61,  63,  77, 103,  87, 109,  82,
         80, 117,  74,  89, 104,  93,  97,  91,  96,  84, 106,  98, 108,
         94,  64, 126, 123,  99, 100,  88, 135, 116, 147, 115, 110, 112,
        124, 119, 146, 101, 131, 111, 141, 102, 149, 138, 114,  92, 118,
        127, 120, 143, 133, 113, 139, 129, 136, 154, 140, 153, 155, 134,
        145, 121, 128, 142, 130, 167, 180, 171, 161, 189, 178, 137, 148,
        122, 132, 144, 176, 151, 172, 157, 159, 170, 174, 152, 158, 164,
        185, 175, 181, 173, 162, 156, 163, 192, 150

In [352]:
from scipy.stats import spearmanr
# Spearman rank correlation between the predicted ordering (row 0 of the
# result) and the ground-truth ordering 0..N-1
spearmanr(SVC_rank(X, y)[0], range(N))

  y = column_or_1d(y, warn=True)


SpearmanrResult(correlation=0.96413560339008497, pvalue=5.063155444092593e-116)

# Motivation
* The applications (Recommender systems and IR).
* System comparison (Can we do statistical significance?).
* The simulation can be improved by choosing pairs more carefully as the comparison results are being produced.

# Demotivation
* Does not scale well. There are ways to make it scale to larger numbers of items, as well as other methods which sacrifice quality for speed.