Compute F1 scores when:

- both phrases are correctly localized
- only one phrase is correctly localized
- both phrases fail to be localized

In [1]:
% cd /home/mayu-ot/durga/Experiments/loc_iparaphrasing/

from chainercv.utils import bbox_iou, non_maximum_suppression
import chainer
import numpy as np
import json
import pandas as pd
import tables
from chainer.dataset.convert import concat_examples
from sklearn.metrics import f1_score, precision_score, recall_score
from func.datasets.datasets import get_agg_roi_df
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.html.widgets import IntProgress
from IPython.display import display
from nltk.metrics import edit_distance

/mnt/fs1/mayu-ot/Experiments/loc_iparaphrasing


  from ._conv import register_converters as _register_converters


In [2]:
from func.datasets.datasets import BBoxDataset, PLCLCBBoxDataset, DDPNBBoxDataset

In [3]:
def get_most_similar(q, targ):
    """Return the element of ``targ`` with the smallest edit distance to ``q``.

    Args:
        q: Query string.
        targ: Iterable of candidate strings.

    Returns:
        The closest candidate; on ties the earliest one wins. ``None`` when
        ``targ`` yields no candidates (previously this case raised
        ``UnboundLocalError`` because the result was never assigned).
    """
    res = None
    best_d = np.inf
    for x in targ:
        d = edit_distance(q, x)
        if d < best_d:
            best_d = d
            res = x
            # An exact match cannot be beaten, so stop scanning.
            if best_d == 0:
                break
    return res

def lget_bbox(df, img_id, phr):
    """Look up the box coordinates stored for a phrase of an image.

    Rows of ``df`` are selected where ``image == img_id`` and
    ``phrase == phr``. If no row matches exactly, the phrase of that image
    with the smallest edit distance to ``phr`` is used instead.

    Returns:
        The ``['ymin', 'xmin', 'ymax', 'xmax']`` columns of the selected
        rows as a 2-D array (one row per matching record).
    """
    same_image = df.image == img_id
    rows = df[same_image & (df.phrase == phr)]

    if len(rows) == 0:
        # No exact phrase match: fall back to the closest phrase of the image.
        candidates = df[same_image].phrase.tolist()
        nearest = get_most_similar(phr, candidates)
        rows = df[same_image & (df.phrase == nearest)]

    return rows[['ymin', 'xmin', 'ymax', 'xmax']].values

In [4]:
# Box table of the PLCLC dataset wrapper for the 'test' split.
data = PLCLCBBoxDataset('test')
plclc_df = data.df

# Table used below as the reference (gt) boxes: the first two index levels
# become ordinary columns and 'org_phrase' is renamed to 'phrase' so that the
# same lookup helper works on all three tables.
data = BBoxDataset('test')
gt_df = (
    data.df
    .reset_index(level=[0, 1])
    .rename(columns={'org_phrase': 'phrase'})
)

# Box table of the DDPN dataset wrapper for the 'test' split.
data = DDPNBBoxDataset('test')
ddpn_df = data.df

In [5]:
# For every phrase pair of the test split, look up the reference box and the
# PLCLC / DDPN boxes of both phrases and record their IoU.
df = pd.read_csv('data/phrase_pair_test.csv')
ip = IntProgress(min=0, max=len(df))
display(ip)

plclc_ious1 = []
plclc_ious2 = []
ddpn_ious1 = []
ddpn_ious2 = []

# Count finished rows from 1 so the bar reaches `max` on the last pair and
# does not depend on the frame's index labels.
for n_done, (_, row) in enumerate(df.iterrows(), start=1):
    im_id, phr1, phr2 = row[['image', 'original_phrase1', 'original_phrase2']]

    gt_bbox1 = lget_bbox(gt_df, im_id, phr1)
    gt_bbox2 = lget_bbox(gt_df, im_id, phr2)

    p_bbox1 = lget_bbox(plclc_df, im_id, phr1)
    p_bbox2 = lget_bbox(plclc_df, im_id, phr2)

    # The DDPN table is queried with lower-cased phrases.
    d_bbox1 = lget_bbox(ddpn_df, im_id, phr1.lower())
    d_bbox2 = lget_bbox(ddpn_df, im_id, phr2.lower())

    plclc_iou1 = bbox_iou(p_bbox1, gt_bbox1)
    plclc_iou2 = bbox_iou(p_bbox2, gt_bbox2)

    # Only the first row of each IoU matrix is kept
    # (presumably the first looked-up box against the reference boxes; verify).
    plclc_ious1.append(plclc_iou1[0])
    plclc_ious2.append(plclc_iou2[0])

    ddpn_iou1 = bbox_iou(d_bbox1, gt_bbox1)
    ddpn_iou2 = bbox_iou(d_bbox2, gt_bbox2)

    ddpn_ious1.append(ddpn_iou1[0])
    ddpn_ious2.append(ddpn_iou2[0])

    ip.value = n_done

IntProgress(value=0, max=81285)

In [6]:
# Turn each list of per-pair IoU rows into one flat 1-D array.
# NOTE(review): the masks built from these arrays are later used to index
# per-pair labels, which assumes exactly one IoU value per pair; confirm.
plclc_ious1, plclc_ious2 = [
    np.asarray(values).ravel() for values in (plclc_ious1, plclc_ious2)
]

ddpn_ious1, ddpn_ious2 = [
    np.asarray(values).ravel() for values in (ddpn_ious1, ddpn_ious2)
]

In [7]:
import seaborn as sns
# Activate seaborn's default theme for matplotlib figures drawn from here on.
sns.set()

In [39]:
# Split the pairs by PLCLC localization outcome (IoU threshold 0.5).
# both success: both phrases of the pair reach the threshold
both_success = (plclc_ious1 >= .5) & (plclc_ious2 >= .5)

# at least one fail: either phrase of the pair falls below the threshold
fail = (plclc_ious1 < .5) | (plclc_ious2 < .5)

# Report each subset as a fraction of all pairs.
n_pairs = both_success.size
print('both success: %.1f' % (both_success.sum() / n_pairs))
print('at least one fail: %.1f' % (fail.sum() / n_pairs))

both success: 0.3
at least one fail: 0.7


In [44]:
# PLCLC
# F1 / precision / recall (in percent) of each result file, reported
# separately for the `both_success` ("s:") and `fail` ("f:") pair subsets
# that are current when this cell runs.
files = [
    'bo_out/vis+plclc/25-20181111-060511/res_test.csv',
    'bo_out/vis+lng+plclc+mult/17-20181108-050238/res_test.csv'
]

results = {'method': [], 's:f1':[], 's:prec':[], 's:rec':[], 'f:f1':[], 'f:prec':[], 'f:rec':[]}

for f in files:
    df = pd.read_csv(f)
    ypred = df.ypred.values
    ytrue = df.ytrue.values

    # The second path component names the method.
    results['method'].append(f.split('/')[1])

    subsets = (
        (both_success, 's:f1', 's:prec', 's:rec'),
        (fail, 'f:f1', 'f:prec', 'f:rec'),
    )
    for mask, f1_key, prec_key, rec_key in subsets:
        yt, yp = ytrue[mask], ypred[mask]
        results[f1_key].append(f1_score(yt, yp) * 100)
        results[prec_key].append(precision_score(yt, yp) * 100)
        results[rec_key].append(recall_score(yt, yp) * 100)

    # Scores over all pairs, printed as fractions rather than percent.
    overall = (
        f1_score(ytrue, ypred),
        precision_score(ytrue, ypred),
        recall_score(ytrue, ypred),
    )
    print('overall: f1 %.2f, prec %.2f, rec %.2f' % overall)

pd.DataFrame(results)

overall: f1 0.58, prec 0.52, rec 0.65
overall: f1 0.85, prec 0.83, rec 0.87


Unnamed: 0,f:f1,f:prec,f:rec,method,s:f1,s:prec,s:rec
0,31.526287,27.002181,37.871503,vis+plclc,79.284538,74.208462,85.106041
1,75.666667,72.292994,79.370629,vis+lng+plclc+mult,92.359551,92.270686,92.448586


In [11]:
# Ours with plclc
# Break the scores of one result file down by PLCLC localization outcome.
df = pd.read_csv('bo_out/vis+lng+plclc+mult/17-20181108-050238/res_test.csv')
ypred = df.ypred.values
ytrue = df.ytrue.values

# Build the three outcome masks here from the PLCLC IoUs (threshold 0.5).
# `one_success` and `both_fail` were not defined by any other cell, so this
# cell previously failed with NameError on a fresh kernel.
loc_ok1 = plclc_ious1 >= .5
loc_ok2 = plclc_ious2 >= .5
both_success = np.logical_and(loc_ok1, loc_ok2)
one_success = np.logical_xor(loc_ok1, loc_ok2)
both_fail = np.logical_not(np.logical_or(loc_ok1, loc_ok2))

# slice(None) selects every pair for the overall line.
reports = (
    ('both success: f1 %.2f, prec %.2f, rec %.2f', both_success),
    ('one success: f1 %.2f, prec %.2f, rec %.2f', one_success),
    ('both fail: f1 %.2f, prec %.2f, rec %.2f', both_fail),
    ('overall: f1 %.2f, prec %.2f, rec %.2f', slice(None)),
)
for fmt, subset in reports:
    f1 = f1_score(ytrue[subset], ypred[subset])
    prec = precision_score(ytrue[subset], ypred[subset])
    rec = recall_score(ytrue[subset], ypred[subset])
    print(fmt % (f1, prec, rec))

both success: f1 0.87, prec 0.86, rec 0.89
one success: f1 0.54, prec 0.46, rec 0.65
both fail: f1 0.80, prec 0.82, rec 0.79
overall: f1 0.85, prec 0.83, rec 0.87


In [29]:
# Split the pairs by DDPN localization outcome (IoU threshold 0.5).
# both success: both phrases of the pair reach the threshold
both_success = (ddpn_ious1 >= .5) & (ddpn_ious2 >= .5)

# at least one fail: either phrase of the pair falls below the threshold
fail = (ddpn_ious1 < .5) | (ddpn_ious2 < .5)

# Report each subset as a fraction of all pairs.
n_pairs = both_success.size
print('both success: %.1f' % (both_success.sum() / n_pairs))
print('at least one fail: %.1f' % (fail.sum() / n_pairs))

both success: 0.7
at least one fail: 0.3


In [30]:
# Mean of the `ytrue` label inside each localization subset
# (the masks are whatever `both_success` / `fail` currently hold).
df = pd.read_csv('data/phrase_pair_test.csv', index_col=0)

# Select the column as a Series so `.mean()` yields a plain scalar; formatting
# a one-element array with '%.2f' relies on a deprecated NumPy conversion.
labels = df.loc[both_success, 'ytrue']
print('both_success: %.2f' % labels.mean())
labels = df.loc[fail, 'ytrue']
print('at least one fail: %.2f' % labels.mean())

both_success: 0.17
at least one fail: 0.06


In [41]:
# DDPN
# F1 / precision / recall (in percent) of each result file, reported
# separately for the `both_success` ("s:") and `fail` ("f:") pair subsets
# that are current when this cell runs.
files = [
    'bo_out/vis_ddpn/7-20181107-221739/res_test.csv',
    'bo_out/vis+lng+ddpn+mult/18-20181108-015934/res_test.csv'
]

results = {'method': [], 's:f1':[], 's:prec':[], 's:rec':[], 'f:f1':[], 'f:prec':[], 'f:rec':[]}

for f in files:
    df = pd.read_csv(f)
    ypred = df.ypred.values
    ytrue = df.ytrue.values

    # The second path component names the method.
    results['method'].append(f.split('/')[1])

    subsets = (
        (both_success, 's:f1', 's:prec', 's:rec'),
        (fail, 'f:f1', 'f:prec', 'f:rec'),
    )
    for mask, f1_key, prec_key, rec_key in subsets:
        yt, yp = ytrue[mask], ypred[mask]
        results[f1_key].append(f1_score(yt, yp) * 100)
        results[prec_key].append(precision_score(yt, yp) * 100)
        results[rec_key].append(recall_score(yt, yp) * 100)

    # Scores over all pairs, printed as fractions rather than percent.
    overall = (
        f1_score(ytrue, ypred),
        precision_score(ytrue, ypred),
        recall_score(ytrue, ypred),
    )
    print('overall: f1 %.2f, prec %.2f, rec %.2f' % overall)

pd.DataFrame(results)

overall: f1 0.66, prec 0.61, rec 0.73
overall: f1 0.86, prec 0.86, rec 0.87


Unnamed: 0,f:f1,f:prec,f:rec,method,s:f1,s:prec,s:rec
0,52.781872,45.621019,62.609266,vis_ddpn,77.820104,75.38786,80.414524
1,78.181623,76.741739,79.676573,vis+lng+ddpn+mult,92.694688,92.73195,92.657455
