In [1]:
import cvxpy as cp
import json
import pickle
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Seed passed to train_test_split and recorded in the saved results / output file names.
_RANDOM_STATE = 1

# Data

In [3]:
# Load the pairwise comparisons. The context manager closes the file handle
# as soon as loading finishes instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('data/comparisons_berlin.p', 'rb') as comparisons_file:
    comparisons = pickle.load(comparisons_file)
print(comparisons.shape)

(7281, 12)


In [4]:
# Partition the full comparison table by whether the recorded score is a tie (0).
tie_mask = comparisons.score == 0
comparisons_df_ties = comparisons[tie_mask]
comparisons_df_no_ties = comparisons[~tie_mask]

In [5]:
# Distinct image ids appearing in either the left or right column.
# NOTE(review): the same expression is assigned to `images` in a later cell and
# `images_list` is not referenced in the visible cells -- confirm it is still needed.
images_list = pd.unique(comparisons[['image_l', 'image_r']].values.ravel('K'))

Split data into training and testing

In [6]:
# Hold out 15% of all comparisons (ties included) for testing; seeded for reproducibility.
X_train, X_test = train_test_split(
    comparisons,
    test_size=0.15,
    random_state=_RANDOM_STATE,
)
print('Train:', X_train.shape)
print('Test:', X_test.shape)

Train: (6188, 12)
Test: (1093, 12)


Split draws

Unique images

In [7]:
# Distinct image ids over both sides of every comparison (train and test alike);
# the position of an id in this array is its column in bn / B and its entry in x.
images = pd.unique(comparisons[['image_l', 'image_r']].values.ravel('K'))

# Optimization problem

## Create variables

$m$ : number of images 

$n$ : number of training comparisons (rows of `X_train`; a tie row leaves its row of $b_n$ all zero)

$x$ : image scores

$t$ : auxiliary (slack) variable introduced by the linearization

In [8]:
# Problem dimensions.
m = len(images)    # number of distinct images
n = len(X_train)   # number of training rows (alternative: len(comparisons_df_no_ties))

# Decision variables.
x = cp.Variable(m)   # one score per image
t = cp.Variable(n)   # one slack per training row

In [9]:
# One row per training comparison, one column per image; populated in the next cell.
bn = np.zeros((n, m))

Create $b_n$ (non-tie comparisons)

In [10]:
# Look up each image's column once instead of scanning `images` with np.where
# for every comparison (ids in `images` are unique, so the mapping is exact).
image_column = {image_id: column for column, image_id in enumerate(images)}

# To fit on non-tie comparisons only, iterate comparisons_df_no_ties instead.
for i, (_, row) in enumerate(X_train.iterrows()):
    # Columns of the left and right images of this comparison
    image_l = image_column[row.image_l]
    image_r = image_column[row.image_r]

    # Encode the outcome as +1 / -1; presumably score == -1 means the left
    # image was preferred -- TODO confirm against the data source.
    if row.score == -1:
        bn[i, image_l] = 1
        bn[i, image_r] = -1
    elif row.score == 1:
        bn[i, image_l] = -1
        bn[i, image_r] = 1
    # Any other score (e.g. a tie, 0) leaves the row all zero.

Create $B$ (tie comparisons)

In [11]:
# b for ties
# Difference matrix for ties: one row per tie comparison, one column per image.
# NOTE(review): comparisons_df_ties is taken from the full dataset, before the
# train/test split, so ties that fall in the test split also influence the
# fitted scores -- confirm this is intended.
B = np.zeros((comparisons_df_ties.shape[0], m))

# Look up each image's column once instead of scanning `images` with np.where
# for every comparison.
image_column = {image_id: column for column, image_id in enumerate(images)}

for i, (_, row) in enumerate(comparisons_df_ties.iterrows()):
    # Every row here has score == 0 (that is how comparisons_df_ties is
    # selected), so each row gets +1 for the left image and -1 for the right.
    image_l = image_column[row.image_l]
    image_r = image_column[row.image_r]
    B[i, image_l] = 1
    B[i, image_r] = -1

$e$ : margin $\epsilon$ used in the constraint $\epsilon - b_n x \le t$

$\lambda_{ties}$ : weight for tie comparisons

In [12]:
# Margin epsilon appearing in the constraint  e - bn @ x <= t.
e = 1e-10
# Weight of the tie term in the objective.
lambda_ties = 1

## Problem's objective

$$ \min_{x,\,t} \; 1^T t + \lambda_{ties} \lVert Bx \rVert_1 $$



In [13]:
# Sanity check: B's column count must equal the length of x for B @ x to be defined.
B.shape, x.shape

((1369, 4481), (4481,))

In [14]:
# Objective: total slack plus the weighted sum of |B x| over the tie comparisons.
slack_weights = np.ones(n)
tie_weights = lambda_ties * np.ones(B.shape[0])
objective = cp.Minimize(slack_weights @ t + tie_weights @ cp.abs(B @ x))
# Variant without the tie term: cp.Minimize(np.ones(n) @ t)

## Problem's constraints

In [15]:
# Constraint list handed to cp.Problem; the following cells append to it in order.
constraints = []

$$1^Tx = 0$$

In [16]:
# The image scores must sum to zero.
constraints.append(np.ones(m).T @ x == 0)

$$\epsilon - b_n x \le t$$

In [17]:
# Each slack t[i] must be at least the margin shortfall e - bn[i] @ x.
constraints.append(e - bn @ x <= t)

$$0 \le t$$

In [18]:
# Slacks are non-negative.
constraints.append(0 <= t)

## Problem

In [19]:
# Assemble the optimisation problem from the objective and the collected constraints.
prob = cp.Problem(objective, constraints)

## Solve

In [20]:
# Solve without naming a solver, so cvxpy chooses one; `result` keeps solve()'s return value.
result = prob.solve()
# Dual value of constraints[0], i.e. the first constraint appended to the list.
print(constraints[0].dual_value)

3.15087703229256e-19


$$ 
x
$$

$$ 
t
$$

In [21]:
# Inspect the solved slack values.
print(t.value)

[-3.40274170e-13 -3.29404290e-13 -3.38624037e-13 ... -3.51525460e-13
 -3.55168311e-13  4.98520517e-11]


# Scores

In [22]:
# Tabulate the solved scores, indexed by image id, under the column name 'score'.
solved_scores = pd.DataFrame(x.value, index=images)
scores_df = solved_scores.rename(columns={0: 'score'})

In [23]:
# Relative path of each image file: images/berlin/<id>.jpg
scores_df['image_path'] = [
    os.path.join('images', 'berlin', str(image_id) + '.jpg')
    for image_id in images
]

In [24]:
# Keep the image id as a regular column too (it is also the index of scores_df).
scores_df['image'] = images

# Metrics

In [25]:
def compute_probabilities(rating_a, rating_b, tie_margin=0):
    """Win / loss / tie probabilities for a pair of ratings, with a tie margin.

    NOTE: a later cell defines another ``compute_probabilities`` (two
    arguments, no tie output) that shadows this one once it has run.

    Parameters
    ----------
    rating_a, rating_b : float or array-like
        Ratings of the two items being compared.
    tie_margin : float, default 0
        Tie parameter on the log scale; ``exp(tie_margin)`` multiplies the
        opponent's strength. With the default 0 the tie probability is 0.

    Returns
    -------
    tuple
        ``(probability_a, probability_b, probability_tie)``.
    """
    theta = np.exp(tie_margin)

    # All three ratios are unchanged when both ratings are shifted by the same
    # amount, so subtract the larger rating first: every exponent is then <= 0
    # and np.exp cannot overflow to inf (which would produce inf / inf = nan).
    shift = np.maximum(rating_a, rating_b)
    strength_a = np.exp(rating_a - shift)
    strength_b = np.exp(rating_b - shift)

    denominator_a = strength_a + theta * strength_b
    denominator_b = strength_b + theta * strength_a

    probability_a = strength_a / denominator_a
    probability_b = strength_b / denominator_b
    probability_tie = ((theta**2 - 1) * strength_a * strength_b) / (denominator_a * denominator_b)

    return probability_a, probability_b, probability_tie


In [26]:
# WITHOUT TIES
def compute_probabilities(rating_a, rating_b):
    probability_a = np.exp(rating_a)/(np.exp(rating_a) + np.exp(rating_b))
    probability_b = np.exp(rating_b)/(np.exp(rating_b) + np.exp(rating_a))

    return probability_a, probability_b   

# WITH TIES
#def compute_probabilities(rating_a, rating_b, tie_margin=0):
#    tie_margin = np.exp(tie_margin)
#    probability_a = np.exp(rating_a)/(np.exp(rating_a) + tie_margin * np.exp(rating_b))
#    probability_b = np.exp(rating_b)/(np.exp(rating_b) + tie_margin * np.exp(rating_a))
#    probability_tie = ((tie_margin**2 - 1) * (np.exp(rating_a)*np.exp(rating_b))) / ((np.exp(rating_a) + tie_margin * np.exp(rating_b)) * (np.exp(rating_b) + tie_margin * np.exp(rating_a)))
#
#    return probability_a, probability_b, probability_tie


In [27]:
def compute_logloss(df):
    """Log-probability the fitted scores assign to each decisive comparison.

    Reads the module-level ``scores_df`` and ``compute_probabilities``.

    Parameters
    ----------
    df : pandas.DataFrame
        Comparisons with ``score``, ``Winner`` and ``Loser`` columns; the
        ``Winner`` / ``Loser`` values are used as labels into ``scores_df``.

    Returns
    -------
    list
        ``log(p_win)`` for every row whose score is -1 or 1. Rows with any
        other score (ties) are skipped, as in ``compute_accuracy``: the
        two-outcome probability model provides no tie probability, and the
        previous ``else`` branch referenced an undefined ``p_tie`` and raised
        NameError on such rows.
    """
    log_loss = []

    for _, row in df.iterrows():
        if row.score not in (-1, 1):
            continue

        p_win, _p_los = compute_probabilities(
            scores_df.loc[row.Winner].score,
            scores_df.loc[row.Loser].score,
        )
        log_loss.append(np.log(p_win))

    return log_loss

In [28]:
def compute_accuracy(df):
    """Flag, per decisive comparison, whether the scores favour the recorded winner.

    Reads the module-level ``scores_df`` and ``compute_probabilities``.
    Returns a list with 1 where the winner's probability is strictly higher
    than the loser's and 0 otherwise; rows whose score is neither -1 nor 1
    add nothing to the list.
    """
    hits = []

    for _, comparison in df.iterrows():
        winner_score = scores_df.loc[comparison.Winner].score
        loser_score = scores_df.loc[comparison.Loser].score
        p_win, p_los = compute_probabilities(winner_score, loser_score)

        if comparison.score not in (-1, 1):
            continue
        hits.append(1 if p_win > p_los else 0)

    return hits

### Compute metrics for training

In [29]:
# Metrics are computed on the decisive (non-tie) training comparisons only.
train_decisive = X_train[X_train.score != 0]
log_loss_train = compute_logloss(train_decisive)
accuracy_train = compute_accuracy(train_decisive)

## Test

### Compute metrics for testing

In [30]:
# Metrics are computed on the decisive (non-tie) test comparisons only.
test_decisive = X_test[X_test.score != 0]
log_loss_test = compute_logloss(test_decisive)
accuracy_test = compute_accuracy(test_decisive)

## Aggregate results

In [31]:
# Summary of the run. The log-loss lists hold log-probabilities, hence the sign flip.
# NOTE(review): the label says 'cvxopt' although the model is built with cvxpy;
# it is kept unchanged because it is part of the output file names.
results = {
    'model': 'cvxopt',
    'train_logloss': -1 * np.mean(log_loss_train),
    'test_logloss': -1 * np.mean(log_loss_test),
    'train_accuracy': np.mean(accuracy_train),
    'test_accuracy': np.mean(accuracy_test),
    'seed': _RANDOM_STATE
}

# Create the output directory on a fresh checkout, and close the file handle
# deterministically instead of leaving the open() result dangling.
os.makedirs('output', exist_ok=True)
results_path = 'output/{}_modelresults_SEED{}.p'.format(results['model'], _RANDOM_STATE)
with open(results_path, 'wb') as results_file:
    pickle.dump(results, results_file)
print(json.dumps(results, indent=4))

{
    "model": "cvxopt",
    "train_logloss": 0.5551159504360182,
    "test_logloss": 0.6500955728193558,
    "train_accuracy": 0.9775124378109453,
    "test_accuracy": 0.6324689966178129,
    "seed": 1
}


In [32]:
# Persist the per-image scores. The directory is created if missing and the
# file handle is closed deterministically by the context manager.
os.makedirs('output', exist_ok=True)
scores_path = 'output/{}_scores_SEED{}.p'.format(results['model'], _RANDOM_STATE)
with open(scores_path, 'wb') as scores_file:
    pickle.dump(scores_df, scores_file)