In [1]:
import choix
import json
import networkx as nx
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

In [2]:
_RANDOM_STATE = 5

# Data

In [3]:
comparisons = pickle.load(open('data/comparisons_berlin.p', 'rb'))
print(comparisons.shape)

(7281, 12)


In [4]:
n_items = len(pd.unique(comparisons[['image_l', 'image_r']].values.ravel('K')))

In [5]:
images = pd.unique(comparisons[['image_l', 'image_r']].values.ravel('K'))

In [6]:
images_dict = {}
images_dict_rev = {}
for i, image_id in enumerate(images):
    images_dict[image_id] = i
    images_dict_rev[i] = image_id

In [7]:
comparisons=comparisons.replace({"image_l": images_dict})
comparisons=comparisons.replace({"image_r": images_dict})
comparisons=comparisons.replace({"Winner": images_dict})
comparisons=comparisons.replace({"Loser": images_dict})

Split data into training and testing

In [8]:
X_train, X_test, = train_test_split(comparisons, test_size=0.15, random_state=_RANDOM_STATE, )# shuffle=False)

Setup data for training

In [9]:
data = []
for i, row in X_train.iterrows():
    if not row.Tie:
        data.append((int(row.Winner), int(row.Loser)))

    if row.Tie:
        data.append((row.Winner, row.Loser))
        data.append((row.Loser, row.Winner))

# Rank Centrality

## Train

In [10]:
params_rc = choix.rank_centrality(n_items, data, alpha=1e-4)

### Organize scores

In [11]:
scores = []
for i, image_scores in enumerate(params_rc):
    scores.append({
        'image': images_dict_rev[i], 
        'score': image_scores,
        'image_id': images_dict_rev[i],
        'image_path': os.path.join('..', '..', 'cycling_safety_objective', 'data_copy_from_storage','mapillary','berlin', images_dict_rev[i] + '.jpg' )
    })
scores_df = pd.DataFrame(scores).set_index('image', drop=False)

### Compute metrics for training

In [12]:
#def compute_probabilities(rating_a, rating_b, tie_margin=0):
#    tie_margin = np.exp(tie_margin)
#    probability_a = np.exp(rating_a)/(np.exp(rating_a) + tie_margin * np.exp(rating_b))
#    probability_b = np.exp(rating_b)/(np.exp(rating_b) + tie_margin * np.exp(rating_a))
#    probability_tie = ((tie_margin**2 - 1) * (np.exp(rating_a)*np.exp(rating_b))) / ((np.exp(rating_a) + tie_margin * np.exp(rating_b)) * (np.exp(rating_b) + tie_margin * np.exp(rating_a)))
#
#    return probability_a, probability_b, probability_tie

In [13]:
def compute_logloss(df, params):
    log_loss = []

    for i, row in df.iterrows():
        try:
            rating_l = params[int(images_dict[str(row.image_l)])]
            rating_r = params[int(images_dict[str(row.image_r)])]
         
            if row.score == -1:
                p_win, p_los = choix.probabilities([int(images_dict[str(row.image_l)]), int(images_dict[str(row.image_r)])], params)
                # p_win, p_los, p_tie = compute_probabilities(rating_l, rating_r, tie_margin=.2)
            elif row.score == 1:
                p_win, p_los = choix.probabilities([int(images_dict[str(row.image_r)]), int(images_dict[str(row.image_l)])], params)
                # p_win, p_los, p_tie = compute_probabilities(rating_r, rating_l, tie_margin=.2)
            # elif row.score == 0:
            #     # p_win, p_los = choix.probabilities([int(images_dict[str(row.image_r)]), int(images_dict[str(row.image_l)])], params)
            #     p_win, p_los, p_tie = compute_probabilities(rating_r, rating_l, tie_margin=.2)
        except KeyError:
            continue
            
        if row.score == -1 or row.score == 1:
            log_loss.append(np.log(p_win))
    return log_loss

In [14]:
def compute_accuracy(df, params):
    accuracy = []
    
    for i, row in df.iterrows():
        try:
            rating_l = params[int(images_dict[str(row.image_l)])]
            rating_r = params[int(images_dict[str(row.image_r)])]
         
            if row.score == -1:
                p_win, p_los = choix.probabilities([int(images_dict[str(row.image_l)]), int(images_dict[str(row.image_r)])], params)
                # p_win, p_los, p_tie = compute_probabilities(rating_l, rating_r, tie_margin=.2)
            elif row.score == 1:
                p_win, p_los = choix.probabilities([int(images_dict[str(row.image_r)]), int(images_dict[str(row.image_l)])], params)
                # p_win, p_los, p_tie = compute_probabilities(rating_r, rating_l, tie_margin=.2)
            # elif row.score == 0:
            #     # p_win, p_los = choix.probabilities([int(images_dict[str(row.image_r)]), int(images_dict[str(row.image_l)])], params)
            #     p_win, p_los, p_tie = compute_probabilities(rating_r, rating_l, tie_margin=.2)
                
        except KeyError:
            continue
        
        if row.score == -1 or row.score == 1:
            if p_win > p_los:
                accuracy.append(1) 
            else:
                accuracy.append(0)  
    return accuracy

In [15]:
log_loss_train_rc = compute_logloss(X_train[X_train.score != 0], params_rc)
accuracy_train_rc = compute_accuracy(X_train[X_train.score != 0], params_rc)

## Test

### Compute metrics for testing

In [16]:
log_loss_test_rc = compute_logloss(X_test[X_test.score != 0], params_rc)
accuracy_test_rc = compute_accuracy(X_test[X_test.score != 0], params_rc)

## Aggregate results

In [17]:
results = {
    'model': 'rankcentrality',
    'train_logloss': -1 * np.mean(log_loss_train_rc),
    'test_logloss': -1 * np.mean(log_loss_test_rc),
    'train_accuracy': np.mean(accuracy_train_rc),
    'test_accuracy': np.mean(accuracy_test_rc),
    'seed': _RANDOM_STATE
}
pickle.dump(results,  open('output/{}_modelresults_SEED{}.p'.format(results['model'], _RANDOM_STATE), 'wb'))
print(json.dumps(results, indent=4))

{
    "model": "rankcentrality",
    "train_logloss": 0.6931820336485601,
    "test_logloss": 0.6931832033562443,
    "train_accuracy": 0.4797979797979798,
    "test_accuracy": 0.453125,
    "seed": 5
}


In [18]:
pickle.dump(scores_df,  open('output/{}_scores_SEED{}.p'.format(results['model'], _RANDOM_STATE), 'wb'))

# Luce Spectral Ranking

## Train

In [19]:
params_lsr = choix.lsr_pairwise(n_items, data, alpha=1e-4)

### Organize scores

In [20]:
scores = []
for i, image_scores in enumerate(params_lsr):
    scores.append({
        'image': images_dict_rev[i], 
        'score': image_scores,
        'image_id': images_dict_rev[i],
        'image_path': os.path.join('..', '..', 'cycling_safety_objective', 'data_copy_from_storage','mapillary','berlin', images_dict_rev[i] + '.jpg' )
    })
scores_df = pd.DataFrame(scores).set_index('image', drop=False)

### Compute metrics for training

In [21]:
log_loss_train_lsr = compute_logloss(X_train[X_train.score != 0], params_lsr)
accuracy_train_lsr = compute_accuracy(X_train[X_train.score != 0], params_lsr)

## Test

### Compute metrics for testing

In [22]:
log_loss_test_lsr = compute_logloss(X_test[X_test.score != 0], params_lsr)
accuracy_test_lsr = compute_accuracy(X_test[X_test.score != 0], params_lsr)

## Aggregate results

In [23]:
results = {
    'model': 'lucespectralranking',
    'train_logloss': -1 * np.mean(log_loss_train_lsr),
    'test_logloss': -1 * np.mean(log_loss_test_lsr),
    'train_accuracy': np.mean(accuracy_train_lsr),
    'test_accuracy': np.mean(accuracy_test_lsr),
    'seed': _RANDOM_STATE
}
pickle.dump(results,  open('output/{}_modelresults_SEED{}.p'.format(results['model'], _RANDOM_STATE), 'wb'))
print(json.dumps(results, indent=4))

{
    "model": "lucespectralranking",
    "train_logloss": 0.8442209909946851,
    "test_logloss": 0.8318830648552918,
    "train_accuracy": 0.47373737373737373,
    "test_accuracy": 0.4583333333333333,
    "seed": 5
}


In [24]:
pickle.dump(scores_df,  open('output/{}_scores_SEED{}.p'.format(results['model'], _RANDOM_STATE), 'wb'))