Compute F1 scores separately for phrase pairs where:

- both phrases are correctly localized (IoU >= 0.5 for each)
- only one phrase is correctly localized
- both phrases fail to be localized

In [1]:
% cd /home/mayu-ot/durga/Experiments/loc_iparaphrasing/

from chainercv.utils import bbox_iou, non_maximum_suppression
import chainer
import numpy as np
import json
import pandas as pd
import tables
from chainer.dataset.convert import concat_examples
from sklearn.metrics import f1_score, precision_score, recall_score
from func.datasets.datasets import get_agg_roi_df
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.html.widgets import IntProgress
from IPython.display import display

/mnt/fs1/mayu-ot/Experiments/loc_iparaphrasing


  from ._conv import register_converters as _register_converters


In [2]:
from func.datasets.datasets import BBoxDataset, PLCLCBBoxDataset

In [3]:
def lget_bbox(df, img_id, phr):
    """Look up the box stored for one (image, phrase) row.

    Args:
        df: frame addressed by an ``(img_id, phr)`` row key and holding
            ``ymin``, ``xmin``, ``ymax`` and ``xmax`` columns.
        img_id: first component of the row key.
        phr: second component of the row key.

    Returns:
        ``np.ndarray`` ordered ``[ymin, xmin, ymax, xmax]``.
    """
    row_key = (img_id, phr)
    coord_names = ('ymin', 'xmin', 'ymax', 'xmax')
    return np.asarray([df.at[row_key, name] for name in coord_names])

In [4]:
# Load the 'test' split of both box datasets and keep their `.df` frames;
# the IoU cells look boxes up in these frames through lget_bbox.
# NOTE(review): going by the p_/gt_ prefixes, PLCLCBBoxDataset presumably holds
# predicted boxes and BBoxDataset the ground-truth ones — confirm in
# func.datasets.datasets.
p_bbox_data = PLCLCBBoxDataset('test')
pbbox_df = p_bbox_data.df
gt_bbox_data = BBoxDataset('test')
gtbbox_df = gt_bbox_data.df

In [5]:
# For every test phrase pair, compute the IoU between the box stored in
# pbbox_df and the one stored in gtbbox_df, separately for each phrase.
df = pd.read_csv('data/phrase_pair_test.csv')
ip = IntProgress(min=0, max=len(df))
display(ip)

ious1 = []
ious2 = []

pair_cols = ['image', 'original_phrase1', 'original_phrase2']
# itertuples yields plain tuples instead of building a Series per row the way
# iterrows does; the looked-up keys are the same.
for n_done, (im_id, phr1, phr2) in enumerate(
        df[pair_cols].itertuples(index=False, name=None), start=1):
    # bbox_iou takes 2-D box arrays, hence the added leading axis.
    p_bbox1 = lget_bbox(pbbox_df, im_id, phr1)[None, :]
    p_bbox2 = lget_bbox(pbbox_df, im_id, phr2)[None, :]

    gt_bbox1 = lget_bbox(gtbbox_df, im_id, phr1)[None, :]
    gt_bbox2 = lget_bbox(gtbbox_df, im_id, phr2)[None, :]

    iou1 = bbox_iou(p_bbox1, gt_bbox1)
    iou2 = bbox_iou(p_bbox2, gt_bbox2)

    ious1.append(iou1[0])
    ious2.append(iou2[0])

    # Count completed rows so the bar actually reaches `max` on the last pair
    # (assigning the 0-based row label left it one step short).
    ip.value = n_done

IntProgress(value=0, max=81285)

In [6]:
# Flatten the collected per-pair IoU lists into 1-D arrays.
plclc_ious1, plclc_ious2 = (np.asarray(vals).ravel() for vals in (ious1, ious2))

In [7]:
import seaborn as sns
# Apply seaborn's default plot styling to any figures drawn after this point.
sns.set()

In [8]:
# Partition the pairs by how many of their two phrases reach IoU >= 0.5.
hit1 = plclc_ious1 >= .5
hit2 = plclc_ious2 >= .5
# Misses are tested with `<` rather than by negating the hits, so a NaN IoU
# counts as neither a hit nor a miss.
miss1 = plclc_ious1 < .5
miss2 = plclc_ious2 < .5

both_success = hit1 & hit2  # both phrases localized
one_success = hit1 ^ hit2   # exactly one phrase localized
both_fail = miss1 & miss2   # neither phrase localized

# Fraction of all pairs falling into each subset.
n_pairs = both_success.size
print('both success: %.1f' % (both_success.sum() / n_pairs))
print('one success: %.1f' % (one_success.sum() / n_pairs))
print('both fail: %.1f' % (both_fail.sum() / n_pairs))

both success: 0.3
one success: 0.5
both fail: 0.2


In [13]:
# Mean of the `ytrue` column inside each localization subset.
df = pd.read_csv('data/phrase_pair_test.csv', index_col=0)
for template, subset in (
        ('both_success: %.2f', both_success),
        ('one_success: %.2f', one_success),
        ('both_fail: %.2f', both_fail)):
    labels = df.loc[subset, ['ytrue']]
    # Series.mean() returns a scalar directly. The previous
    # sum(labels.values)/len(labels) iterated the rows in Python and produced a
    # 1-element array that '%.2f' had to coerce to a float (deprecated in
    # recent NumPy), and it raised ZeroDivisionError on an empty subset.
    print(template % labels['ytrue'].mean())

both_success: 0.24
one_success: 0.05
both_fail: 0.15


In [11]:
# Ours with plclc
df = pd.read_csv('bo_out/frcnn+wea/plclc-roi/14-frcnn+wea-plclc-roi_20180413-165302/res_test.csv')
ytrue, ypred = df.ytrue.values, df.ypred.values

# One report line per subset. The masks are whatever the most recently executed
# mask cell left in both_success / one_success / both_fail; slice(None) selects
# every row for the overall line.
report_plan = (
    ('both success: f1 %.2f, prec %.2f, rec %.2f', both_success),
    ('one success: f1 %.2f, prec %.2f, rec %.2f', one_success),
    ('both fail: f1 %.2f, prec %.2f, rec %.2f', both_fail),
    ('overall: f1 %.2f, prec %.2f, rec %.2f', slice(None)),
)
for template, subset in report_plan:
    y_t = ytrue[subset]
    y_p = ypred[subset]
    f1 = f1_score(y_t, y_p)
    prec = precision_score(y_t, y_p)
    rec = recall_score(y_t, y_p)
    print(template % (f1, prec, rec))

both success: f1 0.92, prec 0.93, rec 0.91
one success: f1 0.70, prec 0.67, rec 0.73
both fail: f1 0.78, prec 0.79, rec 0.76
overall: f1 0.85, prec 0.85, rec 0.85


In [13]:
# visual-only (plclc)
df = pd.read_csv('bo_out/frcnn+none/plclc-wo-vis/19-frcnn+none-plclc-wo-vis_20180520-043930/res_test.csv')
ytrue, ypred = df.ytrue.values, df.ypred.values

# One report line per subset. The masks are whatever the most recently executed
# mask cell left in both_success / one_success / both_fail; slice(None) selects
# every row for the overall line.
report_plan = (
    ('both success: f1 %.2f, prec %.2f, rec %.2f', both_success),
    ('one success: f1 %.2f, prec %.2f, rec %.2f', one_success),
    ('both fail: f1 %.2f, prec %.2f, rec %.2f', both_fail),
    ('overall: f1 %.2f, prec %.2f, rec %.2f', slice(None)),
)
for template, subset in report_plan:
    y_t = ytrue[subset]
    y_p = ypred[subset]
    f1 = f1_score(y_t, y_p)
    prec = precision_score(y_t, y_p)
    rec = recall_score(y_t, y_p)
    print(template % (f1, prec, rec))

both success: f1 0.80, prec 0.78, rec 0.84
one success: f1 0.19, prec 0.15, rec 0.25
both fail: f1 0.50, prec 0.51, rec 0.49
overall: f1 0.60, prec 0.56, rec 0.65


In [14]:
from func.datasets.datasets import PreCompFeatDataset

# For every test phrase pair, compute the IoU between the boxes returned by
# PreCompFeatDataset.get_jbbox and the ones stored in gtbbox_df.
df = pd.read_csv('data/phrase_pair_test.csv')
ip = IntProgress(min=0, max=len(df))
display(ip)

jbbox_data = PreCompFeatDataset('test')

j_ious1 = []
j_ious2 = []

pair_cols = ['image', 'original_phrase1', 'original_phrase2']
# itertuples yields (index label, image, phrase1, phrase2) as a plain tuple
# instead of building a Series per row the way iterrows does.
for n_done, (i, im_id, phr1, phr2) in enumerate(
        df[pair_cols].itertuples(index=True, name=None), start=1):
    # `i` is the row's index label, handed to get_jbbox exactly as before.
    j_bbox1, j_bbox2 = jbbox_data.get_jbbox(i)

    gt_bbox1 = lget_bbox(gtbbox_df, im_id, phr1)[None, :]
    gt_bbox2 = lget_bbox(gtbbox_df, im_id, phr2)[None, :]

    # bbox_iou takes 2-D box arrays, hence the added leading axis.
    iou1 = bbox_iou(j_bbox1[None, :], gt_bbox1)
    iou2 = bbox_iou(j_bbox2[None, :], gt_bbox2)

    j_ious1.append(iou1[0])
    j_ious2.append(iou2[0])

    # Count completed rows so the bar actually reaches `max` on the last pair
    # (assigning the 0-based row label left it one step short).
    ip.value = n_done

IntProgress(value=0, max=81285)

test data: 81285 pairs


In [15]:
# Flatten the collected per-pair IoU lists into 1-D arrays (same names reused).
j_ious1, j_ious2 = (np.asarray(vals).ravel() for vals in (j_ious1, j_ious2))

# Partition the pairs by how many of their two phrases reach IoU >= 0.5.
# This rebinds both_success / one_success / both_fail.
hit1 = j_ious1 >= .5
hit2 = j_ious2 >= .5
# Misses are tested with `<` rather than by negating the hits, so a NaN IoU
# counts as neither a hit nor a miss.
miss1 = j_ious1 < .5
miss2 = j_ious2 < .5

both_success = hit1 & hit2  # both phrases localized
one_success = hit1 ^ hit2   # exactly one phrase localized
both_fail = miss1 & miss2   # neither phrase localized

# Fraction of all pairs falling into each subset.
n_pairs = both_success.size
print('both success: %.1f' % (both_success.sum() / n_pairs))
print('one success: %.1f' % (one_success.sum() / n_pairs))
print('both fail: %.1f' % (both_fail.sum() / n_pairs))

both success: 0.5
one success: 0.4
both fail: 0.1


In [16]:
# Mean of the `ytrue` column inside each localization subset.
df = pd.read_csv('data/phrase_pair_test.csv', index_col=0)
for template, subset in (
        ('both_success: %.2f', both_success),
        ('one_success: %.2f', one_success),
        ('both_fail: %.2f', both_fail)):
    labels = df.loc[subset, ['ytrue']]
    # Series.mean() returns a scalar directly. The previous
    # sum(labels.values)/len(labels) iterated the rows in Python and produced a
    # 1-element array that '%.2f' had to coerce to a float (deprecated in
    # recent NumPy), and it raised ZeroDivisionError on an empty subset.
    print(template % labels['ytrue'].mean())

both_success: 0.14
one_success: 0.12
both_fail: 0.13


In [14]:
# Ours (gt roi)
df = pd.read_csv('bo_out/vis+lng+gtroi/20-gtroi_jittering_20180817-151855/res_test.csv')
ytrue, ypred = df.ytrue.values, df.ypred.values

# One report line per subset. The masks are whatever the most recently executed
# mask cell left in both_success / one_success / both_fail; slice(None) selects
# every row for the overall line.
report_plan = (
    ('both success: f1 %.2f, prec %.2f, rec %.2f', both_success),
    ('one success: f1 %.2f, prec %.2f, rec %.2f', one_success),
    ('both fail: f1 %.2f, prec %.2f, rec %.2f', both_fail),
    ('overall: f1 %.2f, prec %.2f, rec %.2f', slice(None)),
)
for template, subset in report_plan:
    y_t = ytrue[subset]
    y_p = ypred[subset]
    f1 = f1_score(y_t, y_p)
    prec = precision_score(y_t, y_p)
    rec = recall_score(y_t, y_p)
    print(template % (f1, prec, rec))

both success: f1 0.93, prec 0.92, rec 0.94
one success: f1 0.76, prec 0.71, rec 0.82
both fail: f1 0.82, prec 0.80, rec 0.84
overall: f1 0.87, prec 0.85, rec 0.89


In [15]:
# visual-only (gt roi)
df = pd.read_csv('bo_out/vis+gtroi/12-gtroi_jittering_20180817-025111/res_test.csv')
ytrue, ypred = df.ytrue.values, df.ypred.values

# One report line per subset. The masks are whatever the most recently executed
# mask cell left in both_success / one_success / both_fail; slice(None) selects
# every row for the overall line.
report_plan = (
    ('both success: f1 %.2f, prec %.2f, rec %.2f', both_success),
    ('one success: f1 %.2f, prec %.2f, rec %.2f', one_success),
    ('both fail: f1 %.2f, prec %.2f, rec %.2f', both_fail),
    ('overall: f1 %.2f, prec %.2f, rec %.2f', slice(None)),
)
for template, subset in report_plan:
    y_t = ytrue[subset]
    y_p = ypred[subset]
    f1 = f1_score(y_t, y_p)
    prec = precision_score(y_t, y_p)
    rec = recall_score(y_t, y_p)
    print(template % (f1, prec, rec))

both success: f1 0.67, prec 0.63, rec 0.71
one success: f1 0.41, prec 0.30, rec 0.66
both fail: f1 0.61, prec 0.56, rec 0.68
overall: f1 0.59, prec 0.52, rec 0.70
