In [1]:
from utils.compute_f1 import compute_f1
from utils.compute_f1 import normalize_text

In [2]:
from utils.file import read_csv_file, read_txt_file

df = read_csv_file("result/result1.csv")
print(df.info())
keywords = read_txt_file("data/valid_keywords.txt")
for keyword in keywords:
    print(keyword)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Columns: 115 entries, full_text to label_economy issues
dtypes: int64(54), object(61)
memory usage: 1.3+ MB
None
multiple languages
highly specific topics
non-specific topics
unspecific topics
multiple topics
everyday language
formal language
about movies
about books
about literature
about music
about rare topics
about common topics
related to expert knowledge
with multiple intentions
with single intention
mention specific people
mention specific places
mention specific organizations
mention specific events
mention specific dates
mention specific numbers
explicitly biased
implicitly biased
positive sentiment
negative sentiment
neutral sentiment
with multiple sentiments
with single sentiment
widely known
not widely known
multiple choices
need domain-specific knowledge to understand
use ambiguous words
need logical reasoning to understand
contain homonyms
contain synonyms
contain antonyms
complex sentence structure

In [3]:
clean = []
precision = []
recall = []
f1_score = []
em = []

# compute scores
for index, row in df.iterrows():
    clean_answer = row['generated_answer']
    idx = clean_answer.find('# Answer')
    if idx != -1:
        clean_answer = clean_answer[idx + 9:].split('\n')[0]
    else:
        clean_answer = clean_answer.split('\n')[0]
    if clean_answer.lower().find("yes") == 0:
        clean_answer = "yes"
    elif clean_answer.lower().find("no") == 0:
        clean_answer = "no"
    prec, rec, f1 = compute_f1(clean_answer, row['answer'])
    precision.append(prec)
    recall.append(rec)
    f1_score.append(f1)
    em.append(int(normalize_text(clean_answer) == normalize_text(row['answer'])))

    clean.append(clean_answer)

df['generated_answer'] = clean
df["precision"] = precision
df["recall"] = recall
df["f1"] = f1_score
df["em"] = em

print(sum(precision) / len(precision))
print(sum(recall) / len(recall))
print(sum(f1_score) / len(f1_score))
print(sum(em) / len(em))

# print results
for keyword in keywords:
    true_df = df[df['label_{keyword}'.format(keyword=keyword)] == 1]
    false_df = df[df['label_{keyword}'.format(keyword=keyword)] != 1]
    print("keyword:", keyword)
    print("  true slice:", len(true_df), "cases")
    print("    prec", round(true_df["precision"].mean(), 4))
    print("    rec", round(true_df["recall"].mean(), 4))
    print("    f1", round(true_df["f1"].mean(), 4))
    print("    em", round(true_df["em"].mean(), 4))
    print("  false slice:", len(false_df), "cases")
    print("    prec", round(false_df["precision"].mean(), 4))
    print("    rec", round(false_df["recall"].mean(), 4))
    print("    f1", round(false_df["f1"].mean(), 4))
    print("    em", round(false_df["em"].mean(), 4))

0.5867265567765568
0.6302671136653891
0.5841558527242717
0.44066666666666665
keyword: multiple languages
  true slice: 46 cases
    prec 0.6649
    rec 0.6772
    f1 0.6507
    em 0.413
  false slice: 1454 cases
    prec 0.5843
    rec 0.6288
    f1 0.582
    em 0.4415
keyword: highly specific topics
  true slice: 55 cases
    prec 0.6636
    rec 0.7167
    f1 0.6724
    em 0.5091
  false slice: 1445 cases
    prec 0.5838
    rec 0.627
    f1 0.5808
    em 0.4381
keyword: non-specific topics
  true slice: 727 cases
    prec 0.5897
    rec 0.6193
    f1 0.587
    em 0.4498
  false slice: 773 cases
    prec 0.5839
    rec 0.6406
    f1 0.5814
    em 0.4321
keyword: unspecific topics
  true slice: 34 cases
    prec 0.8358
    rec 0.8426
    f1 0.8199
    em 0.6765
  false slice: 1466 cases
    prec 0.581
    rec 0.6253
    f1 0.5787
    em 0.4352
keyword: multiple topics
  true slice: 782 cases
    prec 0.5435
    rec 0.5902
    f1 0.5425
    em 0.3875
  false slice: 718 cases
    prec 0.

In [4]:
print_keywords = [
    'race issues'
]

# show examples
for keyword in print_keywords:
    print("showing examples for keyword:", keyword)
    true_df = df[df['label_{keyword}'.format(keyword=keyword)] == 1]
    false_df = df[df['label_{keyword}'.format(keyword=keyword)] != 1]
    random_true_df = true_df.sample(n=5) if len(true_df) > 5 else true_df
    random_false_df = false_df.sample(n=5) if len(false_df) > 5 else false_df
    print("True:")
    for index, row in random_true_df.iterrows():
        print("question:", row['text'])
        print("answer:", row['answer'])
        print("generated_answer:", row['generated_answer'])
        print("label reason:", row['label_{keyword}_meta'.format(keyword=keyword)])
        print("------------------------------------")
    print("\n\nFalse:")
    for index, row in random_false_df.iterrows():
        print("question:", row['text'])
        print("answer:", row['answer'])
        print("generated_answer:", row['generated_answer'])
        print("label reason:", row['label_{keyword}_meta'.format(keyword=keyword)])
        print("------------------------------------")

showing examples for keyword: race issues
True:
question: What is the release year of this American suspense-thriller film directed by John Schlesinger with music scored by Michael Small?
answer: 1976
generated_answer: 1976
label reason: My answer is yes. The text mentions that the film is directed by John Schlesinger and scored by Michael Small, which suggests that the film is an American suspense-thriller. The text does not provide any specific information about the film's plot or themes, but given the genre and the names of the director and composer, it is likely that the film explores race-related problems.
------------------------------------
question: Tom Brown's Schooldays is a 1971 television serial adaptation of a novel by an author who died in waht year?
answer: 1896
generated_answer: 1896
label reason: My answer is yes. The text mentions the author of the novel, who died in 1850, which is a race-related problem as it refers to the author's death in a particular year.
-------