# Easily export jupyter cells to python module
https://github.com/fastai/course-v3/blob/master/nbs/dl2/notebook2script.py

In [4]:
! python /tf/main/src/scripts/notebook2script.py evaluation.ipynb

Converted evaluation.ipynb to exp/nb_evaluation.py


In [None]:
#export
# imports
from fastai.text import *

In [None]:
sys.path.append("../../")
from src.proc.exp.nb_proc import *
from src.prep.exp.nb_prep import *

In [None]:
#export
# Evaluation metrics for vulnerability detection - Accuracy, Precision, Recall
def eval_vuln(mdl, tst, sp, task, max_toks):
    tps, tns, fps, fns = 0, 0, 0, 0
    tot = 0
    for inpt, lbl in zip(tst["query"], tst["res"]):
        pred = get_clas_res(mdl, "xxbos " + inpt, sp, task, n_toks = max_toks)
        if lbl == "yes":
            if pred == lbl:
                tps += 1
            else: fns += 1
        else:
            if pred == lbl:
                tns += 1
            else: fps += 1
                
        tot += 1
        torch.cuda.empty_cache()
            
    acc   = (tps + tns) / tot
    prec  = tps / (tps + fps) if (tps + fps) != 0 else 0.
    recal = tps / (tps + fns) if (tps + fns) != 0 else 0.
    
    return acc, prec, recal

In [None]:
! pip install nltk

In [None]:
#export
# Dependency downloads
import nltk
# required for meteor to perform similarity score, etc by looking for synonyms, antonyms...
nltk.download('wordnet')

In [None]:
# samples
reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
reference2 = 'It is the guiding principle which guarantees the military forces always being under the command of the Party'
reference3 = 'It is the practical guide for the army always to heed the directions of the party'
exact_reference = 'It is a guide to action which ensures that the military always obeys the commands of the party'
references = [reference1, reference2, reference3, exact_reference]
hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
hypothesis2 = 'It is to insure the troops forever hearing the activity guidebook that party direct'
hypotheses = [hypothesis1, hypothesis2]
bad_reference = ['this is a cat']
bad_sentence = 'non matching hypothesis'
near_hypotheses = ['Here is cat','This is a dog', 'This is a dog.', 'this is a cat']

In [None]:
# comments
comment_ref = ['''/**
* This is a cat
* @param cat is a cat
*/''']
comment_gen = '''/**
* This is a cat
* @param cat is a cat
*/'''

## Bleu Score

In [6]:
#export
from typing import List
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

def _eval_bleu(reference_texts: List[str], generated_text: str, weights: List[int]):
    tokenized_references = [tokenizer.tokenize(reference) for reference in reference_texts]
    tokenized_generated_text = tokenizer.tokenize(generated_text)
    return round(sentence_bleu(
        tokenized_references, 
        tokenized_generated_text, 
        weights=weights),
        4)
    
def eval_bleu1(reference_texts: List[str], generated_text: str):
    return _eval_bleu(reference_texts, 
                      generated_text, 
                      weights = (1,0,0,0))
    
def eval_bleu2(reference_texts: List[str], generated_text: str):
    return _eval_bleu(reference_texts, 
                      generated_text, 
                      weights = (0.5,0.5,0,0))

def eval_bleu3(reference_texts: List[str], generated_text: str):
    return _eval_bleu(reference_texts, 
                      generated_text, 
                      weights = (0.33,0.33,0.33,0))

def eval_bleu4(reference_texts: List[str], generated_text: str):
    return _eval_bleu(reference_texts, 
                      generated_text, 
                      weights = (0.25,0.25,0.25,0.25))

In [None]:
# Sample Bleu Score
bleu_ref = ['this is small test']
bleu_cand = 'this is a test'
print(eval_bleu1(bleu_ref, bleu_cand))
print(eval_bleu2(bleu_ref, bleu_cand))
print(eval_bleu3(bleu_ref, bleu_cand))
print(eval_bleu4(bleu_ref, bleu_cand))

In [9]:
# Sample Bleu Score for case
bleu_ref = ['this is small test']
bleu_cand = 'this is small Test'
print(eval_bleu1(bleu_ref, bleu_cand))
print(eval_bleu2(bleu_ref, bleu_cand))
print(eval_bleu3(bleu_ref, bleu_cand))
print(eval_bleu4(bleu_ref, bleu_cand))

0.75
0.7071
0.6329
0.0


## Meteor Score

In [13]:
#export
from nltk.translate.meteor_score import meteor_score
def eval_meteor(reference_texts: List[str], generated_text: str):
    return round(meteor_score(reference_texts, generated_text, preprocess=str.lower), 4)

In [None]:
#good hypotheses
[eval_meteor(references, hypothesis) for hypothesis in hypotheses]

In [None]:
# bad hypothesis
round(meteor_score(bad_reference, bad_sentence), 4),  eval_meteor(bad_reference, bad_sentence)

In [17]:
# case test meteor
eval_meteor(['This is a cat'], 'this is a cat')

0.9922

In [None]:
eval_meteor(bad_reference, 'this is a cat * * \n   \t  ')

In [None]:
# comment_ref
eval_meteor(comment_ref, comment_gen)

In [None]:
# near hypotheses
[eval_meteor(bad_reference, near_hypothesis) for near_hypothesis in near_hypotheses]

# Rouge-L Metric

In [None]:
#setup 
!pip uninstall -y py-rouge

In [None]:
#export
nltk.download('punkt')

In [10]:
#export
import rouge
import pandas as pd

def _eval_rougeL_single_ref(reference_text: str, generated_text: str):
    evaluator = rouge.Rouge(metrics=['rouge-l'],
                           max_n=4,
#                            limit_length=True,
#                            length_limit=100,
#                            length_limit_type='words',
                           apply_avg=0,
                           apply_best=0,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
    # scores = evaluator.get_scores(all_hypothesis, all_references)
    # watch out, it takes hypothesis first and then references.
    score = evaluator.get_scores(generated_text, reference_text)['rouge-l'][0]
    score_p = score['p'][0]
    score_f = score['f'][0]
    score_r = score['r'][0]
    return [score_p, score_r, score_f]

def eval_rougeL_single_ref(reference_text: List[str], generated_text: str):
    evaluator = rouge.Rouge(metrics=['rouge-l'],
                           max_n=4,
#                            limit_length=True,
#                            length_limit=100,
#                            length_limit_type='words',
                           apply_avg=0,
                           apply_best=0,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
    # scores = evaluator.get_scores(all_hypothesis, all_references)
    # watch out, it takes hypothesis first and then references.
    score = evaluator.get_scores(generated_text, reference_text[0])['rouge-l'][0]
    score_p = score['p'][0]
    score_f = score['f'][0]
    score_r = score['r'][0]
    return (score_p, score_r, score_f)

def eval_rougeL(reference_texts: List[str], generated_text: str):
    scores = [
        _eval_rougeL_single_ref(
            reference, 
            generated_text) 
        for reference in reference_texts]
#     return scores
    result_df = pd.DataFrame(scores)
    # be extra careful, mislabeling is going to be really damaging.
    result_df.columns=['p', 'r', 'f']
    return result_df
    

In [11]:
# rouge L case test

eval_rougeL_single_ref(["This is a cat"], "this is a cat")

(1.0, 1.0, 1.0)

In [2]:
eval_rougeL_single_ref(["This is a cat"], "this is a cat")

NameError: name 'eval_rougeL_single_ref' is not defined

In [12]:
#export
def eval_txt(mdl, ds, sp, task, max_toks):
    b1, b2, b3, b4 = [], [], [], []
    meteor = []
    rouge_l = []
    levenshtein = []
    cosine = []
    jaccard = []
    preds = []
    tokenizer = Tokenizer()
    for inpt, lbl in zip(ds["query"], ds["res"]):
        pred = get_seq_res(mdl, "xxbos " + inpt, sp, task, n_toks = max_toks)
        
        lbl = ' '.join(lbl.split())
        preds.append((pred, lbl))
        
        # bleu 1-4
        b1.append(eval_bleu1([lbl], pred))
        b2.append(eval_bleu2([lbl], pred))
        b3.append(eval_bleu3([lbl], pred))
        b4.append(eval_bleu4([lbl], pred))
        
        # meteor
        meteor.append(eval_meteor([lbl], pred))
        
        # Levenshtein
        levenshtein.append(levenshtein_distance_score(lbl, pred))
        
        # Similarities
        cosine.append(get_cosine_sim(lbl, pred))
        jaccard.append(get_jaccard_sim(lbl, pred))
        
        # rouge
        rouge_l.append(eval_rougeL_single_ref([lbl], pred))
        
    return b1, b2, b3, b4, meteor, rouge_l, levenshtein, cosine, jaccard, preds

In [None]:
#export
# Grabs entire model's response up until special xxbos token,
# i.e. once model begins a new sentence we consider the model finished with its answer.
def get_res(mdl, inpt, sp, task, n_toks = 1_000, greedy = False):
    if greedy:
        res = mdl.beam_search(inpt, n_words = n_toks, beam_sz = 1, top_k = 1).split(" ")
        res = sp.DecodePieces(res).split(" ")[1:]
    else:
        res = mdl.predict(inpt, n_toks, temperature=0.75).split(" ")
        res = sp.DecodePieces(res).split(" ")
    
    try:
        end_res = res.index("xxbos")
    except:
        end_res = len(res) - 1
    
    res = " ".join(res[:end_res])
    res = res[res.find(task) + len(task):]
    
    return res

In [None]:
#export
# Grabs entire model's response up until special xxbos token for a sequence task,
# i.e. once model begins a new sentence we consider the model finished with its answer.
def get_seq_res(mdl, inpt, sp, task, n_toks = 1_000):
    res = mdl.predict(inpt, n_toks, temperature=0.75).split(" ")
    res = sp.DecodePieces(res).replace(task, " ").split(" ")[1:]
    
    try:
        end_res = res.index("xxbos")
    except:
        end_res = len(res) - 1
        
    res = decode_spec_tokens(res[:end_res])
    res = " ".join(res[:end_res])
    
    return res

In [None]:
#export
# Grabs entire model's response up until special xxbos token for a classification task,
# i.e. once model begins a new sentence we consider the model finished with its answer.
def get_clas_res(mdl, inpt, sp, task, n_toks = 10):
    res = mdl.beam_search(inpt, n_words = n_toks, beam_sz = 1, top_k = 1).split(" ")
    res = sp.DecodePieces(res).split(" ")[2:]
    
    try:
        end_res = res.index("xxbos")
    except:
        end_res = len(res) - 1
    
    res = " ".join(res[:end_res])
    res = res[res.find(task) + len(task):]
    
    return res

In [None]:
hypothesis_1 = "King Norodom Sihanouk has declined requests to chair a summit of Cambodia 's top political leaders , saying the meeting would not bring any progress in deadlocked negotiations to form a government .\nGovernment and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen 's party to form a new government failed .\nHun Sen 's ruling party narrowly won a majority in elections in July , but the opposition _ claiming widespread intimidation and fraud _ has denied Hun Sen the two-thirds vote in parliament required to approve the next government .\n"
references_1 = ["Prospects were dim for resolution of the political crisis in Cambodia in October 1998.\nPrime Minister Hun Sen insisted that talks take place in Cambodia while opposition leaders Ranariddh and Sam Rainsy, fearing arrest at home, wanted them abroad.\nKing Sihanouk declined to chair talks in either place.\nA U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.\nBut in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.\nLeft out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians.",
                    "Cambodian prime minister Hun Sen rejects demands of 2 opposition parties for talks in Beijing after failing to win a 2/3 majority in recent elections.\nSihanouk refuses to host talks in Beijing.\nOpposition parties ask the Asian Development Bank to stop loans to Hun Sen's government.\nCCP defends Hun Sen to the US Senate.\nFUNCINPEC refuses to share the presidency.\nHun Sen and Ranariddh eventually form a coalition at summit convened by Sihanouk.\nHun Sen remains prime minister, Ranariddh is president of the national assembly, and a new senate will be formed.\nOpposition leader Rainsy left out.\nHe seeks strong assurance of safety should he return to Cambodia.\n",
                    ]

hypothesis_2 = "China 's government said Thursday that two prominent dissidents arrested this week are suspected of endangering national security _ the clearest sign yet Chinese leaders plan to quash a would-be opposition party .\nOne leader of a suppressed new political party will be tried on Dec. 17 on a charge of colluding with foreign enemies of China '' to incite the subversion of state power , '' according to court documents given to his wife on Monday .\nWith attorneys locked up , harassed or plain scared , two prominent dissidents will defend themselves against charges of subversion Thursday in China 's highest-profile dissident trials in two years .\n"
references_2 = "Hurricane Mitch, category 5 hurricane, brought widespread death and destruction to Central American.\nEspecially hard hit was Honduras where an estimated 6,076 people lost their lives.\nThe hurricane, which lingered off the coast of Honduras for 3 days before moving off, flooded large areas, destroying crops and property.\nThe U.S. and European Union were joined by Pope John Paul II in a call for money and workers to help the stricken area.\nPresident Clinton sent Tipper Gore, wife of Vice President Gore to the area to deliver much needed supplies to the area, demonstrating U.S. commitment to the recovery of the region.\n"

all_hypothesis = [hypothesis_1, hypothesis_2]
all_references = [references_1, references_2]

In [None]:
score = eval_rougeL(references, hypothesis1)
score

In [None]:
score = eval_rougeL(comment_ref, comment_gen)
score

In [None]:
score.mean(axis=0), score.std(axis=0), score.median(axis=0)

In [None]:
score.std(axis=0)

In [None]:
#export
# priya

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

def get_cosine_sim(reference_txt:str, generated_txt:str): 
    vectors = [t for t in _get_vectors(reference_txt, generated_txt)]
    return round(cosine_similarity(vectors[0:1], vectors)[0][1],4)

In [None]:
get_cosine_sim("this is a cat", "this is a bat")

In [112]:
# export
# priya

from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet
lmtzr = WordNetLemmatizer()

def _get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def _lemmatizeSentence(strr):
    return ([lmtzr.lemmatize(word.lower(), _get_wordnet_pos(word)) for word in word_tokenize(strr)])

def get_jaccard_sim(str1, str2):
    a = set(_lemmatizeSentence(str1))
    b = set(_lemmatizeSentence(str2))
    c = a.intersection(b)
    return round(float(len(c)) / (len(a) + len(b) - len(c)),4)


In [113]:
get_jaccard_sim('ai is our friend friend friend and it has been friendly cry', 'AI and humans have always been friendly crying')

0.3846

In [105]:
# export
# # baaler metric.

# !pip install editdistance
import editdistance

def levenshtein_distance_score(reference_txt: str, generated_txt: str):
    return round(editdistance.eval(reference_txt.split(), generated_txt.split()), 4)
    

In [108]:
levenshtein_distance_score("this is a cat", "this is a bat")

1

In [109]:
levenshtein_distance_score("this is a cat", "is a cat this")

2

In [110]:
levenshtein_distance_score("this is a cat", "this is a ")

1

In [111]:
levenshtein_distance_score("this is a cat", "this is a Tom")

1