In [1]:
import choix
import json
import networkx as nx
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# Seed passed to train_test_split and recorded in the saved results and output filenames.
_RANDOM_STATE = 5

# Data

In [3]:
# Load the pickled comparisons table.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('data/comparisons_berlin.p', 'rb') as comparisons_file:
    comparisons = pickle.load(comparisons_file)
print(comparisons.shape)

(7281, 12)


In [4]:
# Number of distinct image ids appearing in either the image_l or image_r column.
n_items = pd.unique(comparisons[['image_l', 'image_r']].values.ravel('K')).size

In [5]:
# Distinct image ids across the image_l and image_r columns; an id's position
# in this array becomes its integer index in the lookup tables built next.
images = pd.unique(comparisons[['image_l', 'image_r']].values.ravel('K'))

In [6]:
# Two-way lookup between image ids and contiguous integer indices:
#   images_dict:     image id -> index
#   images_dict_rev: index    -> image id
images_dict = {image_id: position for position, image_id in enumerate(images)}
images_dict_rev = dict(enumerate(images))

In [7]:
# Swap image ids for their integer indices in every column that refers to an image.
# The nested mapping {column: {old: new}} restricts each replacement to that column.
comparisons = comparisons.replace(
    {column: images_dict for column in ("image_l", "image_r", "Winner", "Loser")}
)

In [8]:
# Inspect the frame after image_l, image_r, Winner and Loser were remapped to integer indices.
comparisons

Unnamed: 0,index,datetime,user,image_i,image_j,score,dataset,image_l,image_r,Winner,Loser,Tie
0,406,2022-09-06 17:13:23,cycling9334a308469b956854470ed3668c578f7c99fa3...,berlin/209.jpg,berlin/7819.jpg,1,berlin,0,3146,3146,0,0
1,407,2022-09-06 17:13:33,cycling9334a308469b956854470ed3668c578f7c99fa3...,berlin/210.jpg,berlin/2123.jpg,1,berlin,1,698,698,1,0
2,408,2022-09-06 17:13:43,cycling9334a308469b956854470ed3668c578f7c99fa3...,berlin/211.jpg,berlin/5265.jpg,-1,berlin,2,3058,2,3058,0
3,409,2022-09-06 17:13:52,cycling9334a308469b956854470ed3668c578f7c99fa3...,berlin/212.jpg,berlin/2024.jpg,-1,berlin,3,1117,3,1117,0
4,410,2022-09-06 17:14:10,cycling9334a308469b956854470ed3668c578f7c99fa3...,berlin/213.jpg,berlin/9692.jpg,1,berlin,4,3147,3147,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
44259,12186,2023-05-18 23:12:16,1684450793634,berlin/1809.jpg,berlin/4312.jpg,1,berlin,587,1306,1306,587,0
44306,12233,2023-05-18 23:14:44,1684451648518,berlin/4313.jpg,berlin/5627.jpg,-1,berlin,1307,1848,1307,1848,0
44321,12248,2023-05-18 23:15:20,1684451648518,berlin/3887.jpg,berlin/4317.jpg,-1,berlin,2566,1308,2566,1308,0
44323,12250,2023-05-18 23:15:28,1684451648518,berlin/2900.jpg,berlin/4325.jpg,0,berlin,2502,1309,1309,2502,1


Split data into training and testing

In [9]:
# Hold out 15% of the comparisons for testing; the split is seeded via _RANDOM_STATE.
X_train, X_test = train_test_split(
    comparisons,
    test_size=0.15,
    random_state=_RANDOM_STATE,
)

Set up data for training

In [10]:
# Build the list of (winner, loser) index pairs handed to the choix estimators.
# Indices are cast to plain ints for every row (the tie branch previously
# appended the raw row values, unlike the non-tie branch).
data = []
for _, row in X_train.iterrows():
    winner, loser = int(row.Winner), int(row.Loser)
    data.append((winner, loser))
    if row.Tie:
        # A tie is encoded as one win in each direction.
        data.append((loser, winner))

# Rank Centrality

## Train

In [11]:
# Fit the Rank Centrality model on the training pairs; the result is iterated
# below as one parameter per image index.
# NOTE(review): alpha is presumably choix's regularization strength — confirm against the choix docs.
params_rc = choix.rank_centrality(n_items, data, alpha=1e-4)

### Organize scores

In [12]:
# One record per image: its original id (stored under both 'image' and
# 'image_id'), its fitted Rank Centrality parameter, and the relative path
# built for its .jpg file.
scores = [
    {
        'image': images_dict_rev[idx],
        'score': strength,
        'image_id': images_dict_rev[idx],
        'image_path': os.path.join(
            '..', '..', 'cycling_safety_objective', 'data_copy_from_storage',
            'mapillary', 'berlin', images_dict_rev[idx] + '.jpg',
        ),
    }
    for idx, strength in enumerate(params_rc)
]
# Index by image id while keeping 'image' available as a regular column.
scores_df = pd.DataFrame(scores).set_index('image', drop=False)

In [13]:
# Inspect the per-image Rank Centrality scores.
scores_df

Unnamed: 0_level_0,image,score,image_id,image_path
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
209,209,-4.459122e-04,209,../../cycling_safety_objective/data_copy_from_...
210,210,-8.922514e-04,210,../../cycling_safety_objective/data_copy_from_...
211,211,-1.338491e-03,211,../../cycling_safety_objective/data_copy_from_...
212,212,-8.929934e-04,212,../../cycling_safety_objective/data_copy_from_...
213,213,-1.337894e-03,213,../../cycling_safety_objective/data_copy_from_...
...,...,...,...,...
9475,9475,4.467661e-04,9475,../../cycling_safety_objective/data_copy_from_...
9477,9477,3.274052e-07,9477,../../cycling_safety_objective/data_copy_from_...
9479,9479,-8.923513e-04,9479,../../cycling_safety_objective/data_copy_from_...
9874,9874,3.274052e-07,9874,../../cycling_safety_objective/data_copy_from_...


### Compute metrics for training

In [14]:
def compute_logloss(df, params):
    """Collect the log-probability the model assigns to each observed winner.

    Rows are visited with ``df.iterrows()``. A row with ``score == -1`` is
    treated as ``image_l`` winning and ``score == 1`` as ``image_r`` winning;
    rows with any other score are skipped.

    Args:
        df: DataFrame with ``score``, ``image_l`` and ``image_r`` columns. The
            image columns must be convertible to ``int`` and are used as item
            indices for ``params``.
        params: Model parameters forwarded unchanged to ``choix.probabilities``.

    Returns:
        list: ``np.log(p_win)`` for every scored row, in row order. The values
        are *not* negated; callers take ``-mean`` to obtain the log loss.
    """
    log_likelihoods = []

    for _, comparison in df.iterrows():
        outcome = comparison.score
        # Order the pair as [winner, loser] so the first probability is the winner's.
        if outcome == -1:
            ordered_pair = [int(comparison.image_l), int(comparison.image_r)]
        elif outcome == 1:
            ordered_pair = [int(comparison.image_r), int(comparison.image_l)]
        else:
            continue

        p_win, _p_los = choix.probabilities(ordered_pair, params)
        log_likelihoods.append(np.log(p_win))

    return log_likelihoods

In [15]:
def compute_accuracy(df, params):
    """Flag, per scored comparison, whether the model favours the observed winner.

    Rows are visited with ``df.iterrows()``. A row with ``score == -1`` is
    treated as ``image_l`` winning and ``score == 1`` as ``image_r`` winning;
    rows with any other score are skipped.

    Args:
        df: DataFrame with ``score``, ``image_l`` and ``image_r`` columns. The
            image columns must be convertible to ``int`` and are used as item
            indices for ``params``.
        params: Model parameters forwarded unchanged to ``choix.probabilities``.

    Returns:
        list: One entry per scored row, in row order: ``1`` when the winner's
        probability is strictly greater than the loser's, otherwise ``0``
        (equal probabilities therefore count as a miss).
    """
    hits = []

    for _, comparison in df.iterrows():
        outcome = comparison.score
        # Order the pair as [winner, loser] so the first probability is the winner's.
        if outcome == -1:
            ordered_pair = [int(comparison.image_l), int(comparison.image_r)]
        elif outcome == 1:
            ordered_pair = [int(comparison.image_r), int(comparison.image_l)]
        else:
            continue

        p_win, p_los = choix.probabilities(ordered_pair, params)
        hits.append(1 if p_win > p_los else 0)

    return hits

In [16]:
# Rank Centrality metrics on the training split, restricted to rows whose score is non-zero.
log_loss_train_rc = compute_logloss(X_train.loc[X_train['score'] != 0], params_rc)
accuracy_train_rc = compute_accuracy(X_train.loc[X_train['score'] != 0], params_rc)

## Test

### Compute metrics for testing

In [17]:
# Rank Centrality metrics on the held-out split, restricted to rows whose score is non-zero.
log_loss_test_rc = compute_logloss(X_test.loc[X_test['score'] != 0], params_rc)
accuracy_test_rc = compute_accuracy(X_test.loc[X_test['score'] != 0], params_rc)

## Aggregate results

In [18]:
# Summarize Rank Centrality performance. compute_logloss returns log-probabilities,
# so their mean is negated here to report a (positive) log loss.
results = {
    'model': 'rankcentrality',
    'train_logloss': -1 * np.mean(log_loss_train_rc),
    'test_logloss': -1 * np.mean(log_loss_test_rc),
    'train_accuracy': np.mean(accuracy_train_rc),
    'test_accuracy': np.mean(accuracy_test_rc),
    'seed': _RANDOM_STATE
}
results_path = 'output/{}_modelresults_SEED{}.p'.format(results['model'], _RANDOM_STATE)
# Context manager ensures the pickle is flushed and the handle closed.
with open(results_path, 'wb') as results_file:
    pickle.dump(results, results_file)
print(json.dumps(results, indent=4))

{
    "model": "rankcentrality",
    "train_logloss": 0.6924919192478953,
    "test_logloss": 0.6929265476900345,
    "train_accuracy": 0.888623707239459,
    "test_accuracy": 0.6459276018099548,
    "seed": 5
}


In [19]:
# Persist the per-image scores; the filename reuses the model name from `results`.
# Context manager ensures the pickle is flushed and the handle closed.
scores_path = 'output/{}_scores_SEED{}.p'.format(results['model'], _RANDOM_STATE)
with open(scores_path, 'wb') as scores_file:
    pickle.dump(scores_df, scores_file)

# Luce Spectral Ranking

## Train

In [20]:
# Fit the Luce Spectral Ranking model on the same training pairs; the result is
# iterated below as one parameter per image index.
# NOTE(review): alpha is presumably choix's regularization strength — confirm against the choix docs.
params_lsr = choix.lsr_pairwise(n_items, data, alpha=1e-4)

### Organize scores

In [21]:
# One record per image: its original id (stored under both 'image' and
# 'image_id'), its fitted Luce Spectral Ranking parameter, and the relative
# path built for its .jpg file.
scores = [
    {
        'image': images_dict_rev[idx],
        'score': strength,
        'image_id': images_dict_rev[idx],
        'image_path': os.path.join(
            '..', '..', 'cycling_safety_objective', 'data_copy_from_storage',
            'mapillary', 'berlin', images_dict_rev[idx] + '.jpg',
        ),
    }
    for idx, strength in enumerate(params_lsr)
]
# Index by image id while keeping 'image' available as a regular column.
scores_df = pd.DataFrame(scores).set_index('image', drop=False)

### Compute metrics for training

In [22]:
# Luce Spectral Ranking metrics on the training split, restricted to rows whose score is non-zero.
log_loss_train_lsr = compute_logloss(X_train.loc[X_train['score'] != 0], params_lsr)
accuracy_train_lsr = compute_accuracy(X_train.loc[X_train['score'] != 0], params_lsr)

## Test

### Compute metrics for testing

In [23]:
# Luce Spectral Ranking metrics on the held-out split, restricted to rows whose score is non-zero.
log_loss_test_lsr = compute_logloss(X_test.loc[X_test['score'] != 0], params_lsr)
accuracy_test_lsr = compute_accuracy(X_test.loc[X_test['score'] != 0], params_lsr)

## Aggregate results

In [24]:
# Summarize Luce Spectral Ranking performance. compute_logloss returns
# log-probabilities, so their mean is negated here to report a (positive) log loss.
results = {
    'model': 'lucespectralranking',
    'train_logloss': -1 * np.mean(log_loss_train_lsr),
    'test_logloss': -1 * np.mean(log_loss_test_lsr),
    'train_accuracy': np.mean(accuracy_train_lsr),
    'test_accuracy': np.mean(accuracy_test_lsr),
    'seed': _RANDOM_STATE
}
results_path = 'output/{}_modelresults_SEED{}.p'.format(results['model'], _RANDOM_STATE)
# Context manager ensures the pickle is flushed and the handle closed.
with open(results_path, 'wb') as results_file:
    pickle.dump(results, results_file)
print(json.dumps(results, indent=4))

{
    "model": "lucespectralranking",
    "train_logloss": 0.34809064003606344,
    "test_logloss": 0.6270193981520985,
    "train_accuracy": 0.9311853619729514,
    "test_accuracy": 0.6481900452488688,
    "seed": 5
}


In [25]:
# Persist the per-image scores; the filename reuses the model name from `results`.
# Context manager ensures the pickle is flushed and the handle closed.
scores_path = 'output/{}_scores_SEED{}.p'.format(results['model'], _RANDOM_STATE)
with open(scores_path, 'wb') as scores_file:
    pickle.dump(scores_df, scores_file)