In [1]:
import hashlib
import json
import re

import pandas as pd

In [2]:
from IPython.display import display, HTML


# Notebook-wide display settings: show every column, keep wide frames on one
# logical line, and never truncate cell text. Options are spelled with their
# full "display." names; shorthand matching can break if pandas later adds an
# option with a similar name.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

def pp(df):
    """Display `df` as an HTML table, showing escaped newlines as line breaks.

    Every backslash-n character pair in the generated markup is replaced by a
    ``<br>`` tag before rendering.
    NOTE(review): presumably this pair is how ``to_html()`` emits newlines
    embedded in cell text - confirm against the pandas version in use.

    Returns whatever ``display`` returns.
    """
    markup = df.to_html()
    rendered = HTML(markup.replace("\\n", "<br>"))
    return display(rendered)

In [3]:
# Maps the bracketed relation marker found inside a serialized answer string
# to the short label stored in the parsed output. parse() scans these keys in
# insertion order and uses the first one that occurs in the string.
NEW_RELATIONS = {
    '[causes]': 'cause', 
    '[enables]': 'enable', 
    '[prevents]': 'prevent',
}


def cont(x, y):
    """Tell whether one string contains the other.

    Both arguments are lower-cased and stripped of surrounding whitespace
    before the substring check, so the comparison is case-insensitive.
    """
    left, right = (text.lower().strip() for text in (x, y))
    if left in right:
        return True
    return right in left


def parse(s):
    """Split a serialized relation string into causes, effects and a label.

    The string is cut once at the first marker from ``NEW_RELATIONS`` that it
    contains. Markers are tried in the dictionary's order, not by position in
    the string. The text on each side is split on ``|``.

    Parameters
    ----------
    s : str
        Text shaped like ``cause | cause [marker] effect | effect``.

    Returns
    -------
    tuple
        ``(causes, effects, relation)``: two sorted lists of stripped,
        non-empty clauses and the label ``NEW_RELATIONS`` maps the marker to.

    Raises
    ------
    ValueError
        If `s` contains none of the known markers.
    """
    # next() is given a default so a missing marker becomes a descriptive
    # ValueError instead of a bare StopIteration, which is hard to diagnose
    # and can silently end an enclosing iteration.
    marker = next((r for r in NEW_RELATIONS if r in s), None)
    if marker is None:
        expected = ', '.join(NEW_RELATIONS)
        raise ValueError(f'no relation marker (expected one of {expected}) in {s!r}')

    # maxsplit=1: only the first occurrence of the marker separates the sides.
    causes, effects = s.split(marker, maxsplit=1)

    # Sorting makes the result independent of the order clauses were written in.
    causes = sorted(x.strip() for x in causes.split("|") if x.strip())
    effects = sorted(x.strip() for x in effects.split("|") if x.strip())

    return causes, effects, NEW_RELATIONS[marker]


def parsec(col):
    """Build a row callback that runs ``parse`` on the value stored under `col`.

    The returned function takes one row (anything indexable by `col`) and
    returns ``parse(row[col])``.
    """
    def _parse_column(row):
        return parse(row[col])

    return _parse_column


def fmt_answer(s):
    """Rewrite a serialized relation string in a normalized three-line form.

    The string is parsed with ``parse``. When at least one cause was found,
    the result is the causes list, the relation label and the effects list,
    each on its own line. When no cause was found, `s` is returned unchanged.
    """
    parsed_causes, parsed_effects, label = parse(s)
    if parsed_causes:
        return f'{parsed_causes}\n{label}\n{parsed_effects}'

    return s


def read_json(path):
    """Load gold answers and predictions from a JSON file into one frame.

    The file must hold an object with two record lists: ``label_ids``
    (records with ``id`` and ``answers``) and ``predictions`` (records with
    ``id`` and ``prediction_text``). The lists are joined on ``id`` with
    ``pd.merge``'s default inner join, so ids present in only one list are
    dropped.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``gold`` (from ``answers``) and ``pred`` (from
        ``prediction_text``).
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which may not be UTF-8.
    with open(path, encoding='utf-8') as f:
        j = json.load(f)

    label = pd.DataFrame.from_records(j['label_ids'])
    pred = pd.DataFrame.from_records(j['predictions'])
    join = pd.merge(label, pred, on='id')

    df = join[['answers', 'prediction_text']].rename(columns={
        'answers': 'gold',
        'prediction_text': 'pred',
    })
    return df


def process_df(df):
    """Expand multi-line answers to one relation per row and score them.

    Each ``gold`` / ``pred`` string is split on newlines. Rows where the two
    sides have a different number of lines are dropped. The rest are exploded
    so each output row holds one gold relation and its prediction.

    The input frame is not modified, so the function can be called again on
    the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``gold`` and ``pred``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, new_df)``. ``df`` has ``gold`` and ``pred`` rewritten by
        ``fmt_answer`` plus a boolean ``correct`` column (exact match of the
        two formatted strings). ``new_df`` has the parsed pieces in
        ``gold.causes``, ``gold.effects``, ``gold.relation``, ``pred.causes``,
        ``pred.effects``, ``pred.relation``, plus the same ``correct`` column.
        Both keep the original row labels, so dropped rows leave gaps and
        exploded rows share a label.
    """
    # assign() builds a new frame; the caller's raw strings are left intact.
    df = df.assign(
        gold=df['gold'].map(lambda x: x.split('\n')),
        pred=df['pred'].map(lambda x: x.split('\n')),
    )
    # Exploding two columns together needs lists of equal length in each row,
    # so rows whose line counts disagree are filtered out first.
    len_match = df['gold'].map(len) == df['pred'].map(len)
    df = df[len_match].explode(['gold', 'pred'])

    # parse() returns a 3-tuple; result_type='expand' spreads it over columns.
    new_df = pd.DataFrame()
    new_df[['gold.causes', 'gold.effects', 'gold.relation']] = df.apply(
        parsec('gold'), axis='columns', result_type='expand'
    )
    new_df[['pred.causes', 'pred.effects', 'pred.relation']] = df.apply(
        parsec('pred'), axis='columns', result_type='expand'
    )

    # Compare the normalized strings so clause order and spacing do not matter.
    df['gold'] = df['gold'].map(fmt_answer)
    df['pred'] = df['pred'].map(fmt_answer)
    correct = df['gold'] == df['pred']
    df['correct'] = correct
    new_df['correct'] = correct

    return df, new_df


def process_other(df):
    """Flatten the clause lists to text and tag each row with a short hash.

    The columns ``gold.causes``, ``gold.effects``, ``pred.causes`` and
    ``pred.effects`` are expected to hold iterables of strings. Each is
    replaced by its items lower-cased, stripped and joined with `` | ``. A
    ``hash`` column is then added: the first 8 hex digits of the SHA-1 of the
    stringified ``(gold.causes, gold.effects, gold.relation)`` tuple.

    `df` is modified in place and the same object is returned.
    """
    def _flatten(items):
        return ' | '.join(item.lower().strip() for item in items)

    def _hash(x):
        digest = hashlib.sha1(str(x).encode('utf-8'))
        return digest.hexdigest()[:8]

    def _row_hash(row):
        return _hash(tuple(row))

    list_columns = [
        f'{src}.{part}'
        for src in ('gold', 'pred')
        for part in ('causes', 'effects')
    ]
    for name in list_columns:
        df[name] = df[name].map(_flatten)

    # The hash is taken after flattening, so it covers the joined text.
    key_columns = ['gold.causes', 'gold.effects', 'gold.relation']
    df['hash'] = df[key_columns].apply(_row_hash, axis='columns')
    return df


# Load the gold/prediction pairs from no_prompt/predict_outputs.json.
df = read_json('no_prompt/predict_outputs.json')
# valid: formatted gold/pred strings plus a 'correct' flag;
# other: the parsed causes/effects/relation columns plus the same flag.
valid, other = process_df(df)
# Turn the clause lists into ' | '-joined text and add the 'hash' column.
other = process_other(other)
pp(valid.head())

Unnamed: 0,gold,pred,correct
0,"['BB&T and SunTrust have completed their merger, forming Truist'] enable ['which we believe will drive the next step up in profitability for the franchises']","['BB&T and SunTrust have completed their merger, forming Truist'] enable ['drive the next step up in profitability for the franchises']",False
1,"[""Given Tulip's lack of profitability (management has stated the business was not profitable at the time of the October 2019 acquisition)""] enable ['we do not believe the business maintains a cost advantage']","[""Tulip's lack of profitability""] cause ['we do not believe the business maintains a cost advantage']",False
2,"['pipeline transportation costs are not tied to the price of natural gas and crude oil'] cause [""TC Energy's profitability is not directly tied to commodity prices""]","['pipeline transportation costs are not tied to the price of natural gas and crude oil'] cause [""TC Energy's profitability is not directly tied to commodity prices""]",True
3,['Wynn has recently renovated rooms and the gaming floor space at its peninsula property'] enable ['help the facility maintain market share over the next few years'],['Wynn has recently renovated rooms and the gaming floor space at its peninsula property'] enable ['help the facility maintain market share over the next few years'],True
4,['recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19'] cause ['We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021'],['recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19'] cause ['We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021'],True


In [4]:
# Preview the first rows of the parsed frame.
pp(other.head())

Unnamed: 0,gold.causes,gold.effects,gold.relation,pred.causes,pred.effects,pred.relation,correct,hash
0,"bb&t and suntrust have completed their merger, forming truist",which we believe will drive the next step up in profitability for the franchises,enable,"bb&t and suntrust have completed their merger, forming truist",drive the next step up in profitability for the franchises,enable,False,79460147
1,given tulip's lack of profitability (management has stated the business was not profitable at the time of the october 2019 acquisition),we do not believe the business maintains a cost advantage,enable,tulip's lack of profitability,we do not believe the business maintains a cost advantage,cause,False,ec89c529
2,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,True,594e66df
3,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,True,04203351
4,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,True,5a3ca6d1


In [5]:
# Show the parsed frame; the rendered output below is truncated in the middle.
other

Unnamed: 0,gold.causes,gold.effects,gold.relation,pred.causes,pred.effects,pred.relation,correct,hash
0,"bb&t and suntrust have completed their merger, forming truist",which we believe will drive the next step up in profitability for the franchises,enable,"bb&t and suntrust have completed their merger, forming truist",drive the next step up in profitability for the franchises,enable,False,79460147
1,given tulip's lack of profitability (management has stated the business was not profitable at the time of the october 2019 acquisition),we do not believe the business maintains a cost advantage,enable,tulip's lack of profitability,we do not believe the business maintains a cost advantage,cause,False,ec89c529
2,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,True,594e66df
3,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,True,04203351
4,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,True,5a3ca6d1
...,...,...,...,...,...,...,...,...
2142,the pressure of fee compression and industry competition,weigh on results in the long run,cause,the pressure of fee compression and industry competition,results in the long run,prevent,False,1d38ecf7
2143,the customer has to bring the vehicle to each shop to get a quote,"the vehicle, and comparison-shopping for repair work is very time-consuming",cause,the customer has to bring the vehicle to each shop to get a quote,comparison-shopping for repair work is very time-consuming,cause,False,b15c153b
2144,large global cros,"produce the best trial designs and enroll patients into trials quickly and efficiently, which allows drugs to reach the market faster",enable,"large global cros, such as syneos, have developed regulatory expertise and proprietary data","help them produce the best trial designs and enroll patients into trials quickly and efficiently, which allows drugs to reach the market faster",enable,False,2a434bed
2146,hard-pressed motorists delay new vehicle purchases and instead look to keep their current cars and trucks running,the automotive segment carries less risk,cause,hard-pressed motorists delay new vehicle purchases and instead look to keep their current cars and trucks running,the automotive segment carries less risk,cause,True,c7f656cc


In [6]:
# Save the parsed frame to noprompt.parquet.
other.to_parquet('noprompt.parquet')