In [1]:
import json
import pandas as pd

In [2]:
from IPython.display import display, HTML


# Notebook-wide pandas display settings: no limit on the number of columns
# shown, no wrapping of wide frames across several blocks, and no truncation
# of long cell text.
# Full option names are used throughout: abbreviated keys such as
# 'max_colwidth' are resolved by pattern matching and stop working as soon as
# another option with a similar name exists.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

def pp(df):
    """Display ``df`` as an HTML table with line breaks inside cells.

    Every literal backslash-n sequence found in the markup produced by
    ``df.to_html()`` is swapped for a ``<br>`` tag before the table is handed
    to IPython for rendering.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))

In [3]:
NEW_RELATIONS = ['[causes]', '[enables]', '[prevents]']


def fmt_answer(s):
    """Put the relation tag of a clause on its own line.

    The first entry of ``NEW_RELATIONS`` (in list order) that occurs in ``s``
    is used as the separator. The result is the text before the tag, the tag
    itself and the text after the tag, joined by newline characters.
    Whitespace around the tag is kept exactly as it appears in ``s``.

    Args:
        s: A clause string, normally holding exactly one relation tag.

    Returns:
        The three-line formatted clause, or ``s`` unchanged when it holds none
        of the known tags.
    """
    # A default for next() avoids a bare StopIteration on tag-less clauses.
    relation = next((r for r in NEW_RELATIONS if r in s), None)
    if relation is None:
        return s
    # partition() splits on the first occurrence only, so a repeated tag does
    # not break the unpacking; any later tags stay in the trailing part.
    before, _, after = s.partition(relation)
    return f'{before}\n{relation}\n{after}'


def multiple(s):
    """Tell whether ``s`` holds more than one relation tag in total.

    Occurrences of every entry in ``NEW_RELATIONS`` are counted together, so
    two different tags or one tag repeated twice both count as multiple.
    """
    tag_total = 0
    for tag in NEW_RELATIONS:
        tag_total += s.count(tag)
    return tag_total > 1


def read_json(path):
    """Load a prediction dump and pair every gold answer with its prediction.

    The JSON document must hold two lists of records, ``label_ids`` and
    ``predictions``, which are inner-joined on their shared ``id`` field.

    Args:
        path: Location of the JSON file.

    Returns:
        A DataFrame with two columns: ``gold`` (taken from ``answers``) and
        ``pred`` (taken from ``prediction_text``).
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would garble non-ASCII text on some systems.
    with open(path, encoding='utf-8') as f:
        j = json.load(f)

    label = pd.DataFrame.from_records(j['label_ids'])
    pred = pd.DataFrame.from_records(j['predictions'])
    join = pd.merge(label, pred, on='id')

    df = join[['answers', 'prediction_text']].rename(columns={
        'answers': 'gold',
        'prediction_text': 'pred',
    })
    return df


def process_df(df):
    """Split gold/pred answers into clauses and bucket them for error analysis.

    ``df`` must have string columns ``gold`` and ``pred`` whose clauses are
    separated by newlines. The input frame is not modified, so the function
    can be called again on the same frame.

    Returns:
        A ``(valid, unmatched, multis)`` tuple of DataFrames, each carrying a
        boolean ``correct`` column and a ``type`` label:

        * valid: entries where gold and pred have the same number of clauses
          and the predicted clause is not flagged by ``multiple``. One row
          per clause pair, with ``gold``/``pred`` reformatted by
          ``fmt_answer``; ``type`` is ``'valid'``.
        * unmatched: entries where the number of clauses differs between gold
          and pred. One row per entry with the original newline-joined text;
          ``correct`` is always False and ``type`` is ``'umatched'``.
        * multis: entries where the number of clauses matches but the
          predicted clause contains more than one tag. One row per clause
          pair; ``type`` is ``'multis'``.
    """
    # Build the split columns on a new frame: assigning into ``df`` would turn
    # the caller's strings into lists and make a second call fail on .split.
    clauses = df.assign(
        gold=df['gold'].map(lambda x: x.split('\n')),
        pred=df['pred'].map(lambda x: x.split('\n')),
    )
    len_match = clauses['gold'].map(len) == clauses['pred'].map(len)

    unmatched = clauses[~len_match].copy()
    unmatched['correct'] = False
    # Label spelling (sic) kept as-is so recorded outputs and any filters on
    # this value keep matching.
    unmatched['type'] = 'umatched'
    unmatched['gold'] = unmatched['gold'].map('\n'.join)
    unmatched['pred'] = unmatched['pred'].map('\n'.join)

    # One row per (gold clause, pred clause) pair.
    matching = clauses[len_match].explode(['gold', 'pred'])
    matching['correct'] = matching['gold'] == matching['pred']

    # Evaluate the tag check once; astype(bool) keeps the mask boolean even
    # when there are no rows to map over.
    is_multi = matching['pred'].map(multiple).astype(bool)

    multis = matching[is_multi].copy()
    multis['type'] = 'multis'

    valid = matching[~is_multi].copy()
    valid['gold'] = valid['gold'].map(fmt_answer)
    valid['pred'] = valid['pred'].map(fmt_answer)
    valid['type'] = 'valid'

    return valid, unmatched, multis


# Load the gold/pred pairs of the "no_prompt" run.
df = read_json('no_prompt/predict_outputs.json')
# process_df returns (valid, unmatched, multis); the valid rows are kept under
# the run's name.
no_prompt, unmatch, multis = process_df(df)
pp(no_prompt.head())

Unnamed: 0,gold,pred,correct,type
0,"BB&T and SunTrust have completed their merger, forming Truist [enables]  which we believe will drive the next step up in profitability for the franchises","BB&T and SunTrust have completed their merger, forming Truist [enables]  drive the next step up in profitability for the franchises",False,valid
1,Given Tulip's lack of profitability (management has stated the business was not profitable at the time of the October 2019 acquisition) [enables]  we do not believe the business maintains a cost advantage,Tulip's lack of profitability [causes]  we do not believe the business maintains a cost advantage,False,valid
2,pipeline transportation costs are not tied to the price of natural gas and crude oil [causes]  TC Energy's profitability is not directly tied to commodity prices,pipeline transportation costs are not tied to the price of natural gas and crude oil [causes]  TC Energy's profitability is not directly tied to commodity prices,True,valid
3,Wynn has recently renovated rooms and the gaming floor space at its peninsula property [enables]  help the facility maintain market share over the next few years,Wynn has recently renovated rooms and the gaming floor space at its peninsula property [enables]  help the facility maintain market share over the next few years,True,valid
4,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19 [causes]  We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19 [causes]  We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021,True,valid


In [4]:
# First few valid rows whose predicted clause differs from the gold clause.
pp(no_prompt[~no_prompt['correct']].head())

Unnamed: 0,gold,pred,correct,type
0,"BB&T and SunTrust have completed their merger, forming Truist [enables]  which we believe will drive the next step up in profitability for the franchises","BB&T and SunTrust have completed their merger, forming Truist [enables]  drive the next step up in profitability for the franchises",False,valid
1,Given Tulip's lack of profitability (management has stated the business was not profitable at the time of the October 2019 acquisition) [enables]  we do not believe the business maintains a cost advantage,Tulip's lack of profitability [causes]  we do not believe the business maintains a cost advantage,False,valid
5,management will continue to pursue acquisition targets with high levels of recurring revenue [prevents]  the cyclicality of Fortive's portfolio,management will continue to pursue acquisition targets with high levels of recurring revenue [causes]  further lower the cyclicality of Fortive's portfolio,False,valid
6,"Beyond's products emit 90% less greenhouse gases, require 93% less land, 99% less water, and 46% less energy to produce [causes]  the 20% of consumers willing to adjust their habits to benefit the environment","Beyond's products emit 90% less greenhouse gases, require 93% less land, 99% less water, and 46% less energy to produce than their meat equivalents [causes]  a primary growth driver to be the 20% of consumers willing to adjust their habits to benefit the environment",False,valid
9,"This is largely attributable to greater capital efficiency and business mix tilted toward credit cards [causes]  drive the bank's exceptional net interest margins, which routinely exceed those of money center banks by 600 basis points",greater capital efficiency and business mix tilted toward credit cards [causes]  drive the bank's exceptional net interest margins,False,valid


In [5]:
# Stack the three buckets (valid, unmatched, multis) back into one frame.
allx = pd.concat((no_prompt, unmatch, multis))
pp(allx.head())

Unnamed: 0,gold,pred,correct,type
0,"BB&T and SunTrust have completed their merger, forming Truist [enables]  which we believe will drive the next step up in profitability for the franchises","BB&T and SunTrust have completed their merger, forming Truist [enables]  drive the next step up in profitability for the franchises",False,valid
1,Given Tulip's lack of profitability (management has stated the business was not profitable at the time of the October 2019 acquisition) [enables]  we do not believe the business maintains a cost advantage,Tulip's lack of profitability [causes]  we do not believe the business maintains a cost advantage,False,valid
2,pipeline transportation costs are not tied to the price of natural gas and crude oil [causes]  TC Energy's profitability is not directly tied to commodity prices,pipeline transportation costs are not tied to the price of natural gas and crude oil [causes]  TC Energy's profitability is not directly tied to commodity prices,True,valid
3,Wynn has recently renovated rooms and the gaming floor space at its peninsula property [enables]  help the facility maintain market share over the next few years,Wynn has recently renovated rooms and the gaming floor space at its peninsula property [enables]  help the facility maintain market share over the next few years,True,valid
4,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19 [causes]  We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19 [causes]  We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021,True,valid


In [6]:
# Row counts per (correct, type) pair, with `correct` pivoted into columns and
# absent combinations filled with 0.
dd = allx.groupby(['correct', 'type']).size().unstack(0).fillna(0)
# Divide by the grand total so every cell is a share of all rows.
dd/dd.to_numpy().sum()

correct,False,True
type,Unnamed: 1_level_1,Unnamed: 2_level_1
multis,0.040037,0.0
umatched,0.123371,0.0
valid,0.591713,0.244879


In [11]:
# Larger sample (up to 20 rows) of valid rows where pred differs from gold.
pp(no_prompt[~no_prompt['correct']].head(20))

Unnamed: 0,gold,pred,correct,type
0,"BB&T and SunTrust have completed their merger, forming Truist [enables]  which we believe will drive the next step up in profitability for the franchises","BB&T and SunTrust have completed their merger, forming Truist [enables]  drive the next step up in profitability for the franchises",False,valid
1,Given Tulip's lack of profitability (management has stated the business was not profitable at the time of the October 2019 acquisition) [enables]  we do not believe the business maintains a cost advantage,Tulip's lack of profitability [causes]  we do not believe the business maintains a cost advantage,False,valid
5,management will continue to pursue acquisition targets with high levels of recurring revenue [prevents]  the cyclicality of Fortive's portfolio,management will continue to pursue acquisition targets with high levels of recurring revenue [causes]  further lower the cyclicality of Fortive's portfolio,False,valid
6,"Beyond's products emit 90% less greenhouse gases, require 93% less land, 99% less water, and 46% less energy to produce [causes]  the 20% of consumers willing to adjust their habits to benefit the environment","Beyond's products emit 90% less greenhouse gases, require 93% less land, 99% less water, and 46% less energy to produce than their meat equivalents [causes]  a primary growth driver to be the 20% of consumers willing to adjust their habits to benefit the environment",False,valid
9,"This is largely attributable to greater capital efficiency and business mix tilted toward credit cards [causes]  drive the bank's exceptional net interest margins, which routinely exceed those of money center banks by 600 basis points",greater capital efficiency and business mix tilted toward credit cards [causes]  drive the bank's exceptional net interest margins,False,valid
12,evidence that Gap lacks a competitive edge [enables]  its operating margins and returns on invested capital have declined since 2013,Gap lacks a competitive edge [causes]  its operating margins and returns on invested capital have declined since 2013,False,valid
13,people start opting to use debit cards [prevents]  Discover's ability to generate interest income,people start opting to use debit cards rather than credit cards [causes]  eat into Discover's ability to generate interest income,False,valid
14,ethylene oxide becomes a priority for state officials | licenses for these facilities are given by state and local officials [causes]  Teleflex could face supply disruptions,"licenses for these facilities are given by state and local officials [causes]  if ethylene oxide becomes a priority for state officials, Teleflex could face supply disruptions",False,valid
17,We still view Polaris as more seasonally and end-user diversified [causes]  BRP's normalized operating margins could trail slightly behind,"We still view Polaris as more seasonally and end-user diversified (it sells electric vehicles, military products, motorcycles, and more) [causes]  BRP's normalized operating margins could trail slightly behind",False,valid
18,"The commodity products ADM moves around the world are readily available from competitors, and the company has little pricing power over the products it buys and sells [causes]  slim margins","The commodity products ADM moves around the world are readily available from competitors [causes]  the company has little pricing power over the products it buys and sells, making for slim margins",False,valid
