# Recall@K with Simple Distance Methods
- Edit Distance
- BLEU

In [44]:
# import modules
import pandas as pd
import numpy as np
import editdistance
import nltk
from modules.evaluation_metrics import recall_at_k

In [39]:
# Read in data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
valid = pd.read_csv("data/valid.csv")

In [40]:
test.head(1)

Unnamed: 0,Context,Ground Truth Utterance,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8
0,anyon know whi my stock oneir export env var u...,nice thank ! __eou__,"wrong channel for it , but check efnet.org , u...","everi time the kernel chang , you will lose vi...",ok __eou__,! nomodeset > acer __eou__ i 'm assum it be a ...,http : //www.ubuntu.com/project/about-ubuntu/d...,thx __eou__ unfortun the program be n't instal...,how can i check ? by do a recoveri for test ? ...,my humbl apolog __eou__,# ubuntu-offtop __eou__


## Edit Distance

In [41]:
def edit_dist_pred(obs):
    
    # list of distances
    dists = [editdistance.eval(obs[0], obs[x]) for x in range(1,11)]
    
    # dataframe for sorting
    sort_df = pd.DataFrame({'choices': list(range(0,10)), 'distances': dists})
    sort_df.sort_values(by='distances', inplace=True)
    return sort_df['choices'].tolist()

In [42]:
answers = []
for index, row in test.iterrows():
    answers.append(edit_dist_pred(row.tolist()))

In [43]:
for k in [1, 2, 5, 10]:
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall_at_k(answers, k)))

Recall @ 1, 10 total choices: 0.133087
Recall @ 2, 10 total choices: 0.242072
Recall @ 5, 10 total choices: 0.542653
Recall @ 10, 10 total choices: 1


## BLEU Score

In [90]:
def bleu_pred(obs):
    
    context = obs[0].split()
    
    # list of scores
    dists = [
        nltk.translate.bleu_score.sentence_bleu(
            obs[x].split(), 
            context, 
            weights=[1] # unigrams
        ) 
        for x in range(1,11)
    ]
    
    # dataframe for sorting
    sort_df = pd.DataFrame({'choices': list(range(0,10)), 'distances': dists})
    sort_df.sort_values(by='distances', inplace=True, ascending=False)
    return sort_df['choices'].tolist()

In [91]:
answers = []
for index, row in test.iterrows():
    if index % 1000 == 0:
        print str(index) + 'th row reached'
    answers.append(bleu_pred(row.tolist()))

0th row reached
1000th row reached
2000th row reached
3000th row reached
4000th row reached
5000th row reached
6000th row reached
7000th row reached
8000th row reached
9000th row reached
10000th row reached
11000th row reached
12000th row reached
13000th row reached
14000th row reached
15000th row reached
16000th row reached
17000th row reached
18000th row reached


In [92]:
for k in [1, 2, 5, 10]:
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall_at_k(answers, k)))

Recall @ 1, 10 total choices: 0.220772
Recall @ 2, 10 total choices: 0.378171
Recall @ 5, 10 total choices: 0.710518
Recall @ 10, 10 total choices: 1
