In [1]:
import hashlib
import json
import re

import pandas as pd

In [2]:
from IPython.display import HTML, display

# Notebook-wide display settings: no limit on the number of columns shown, no
# wrapping of wide frames across several lines, and no truncation of long cell
# text. Every option is addressed by its full dotted name; shortened names are
# resolved by pattern matching and can become ambiguous in later pandas versions.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_colwidth", None)


def pp(df):
    """Display ``df`` as an HTML table with escaped newlines shown as breaks.

    The frame is rendered with ``DataFrame.to_html``; every literal
    backslash-n (two characters) in that markup is then swapped for a
    ``<br>`` tag before the result is handed to IPython's ``display``.
    Presumably this makes multi-line cell text readable -- confirm against
    how pandas escapes newlines in ``to_html``.

    Returns whatever ``display`` returns.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))

In [3]:
def parse(s):
    """Split a tagged answer string into its causes, effects and relation.

    The expected layout is ``[Cause]...[Relation]...[Effect]...`` running to
    the end of the string. The cause and effect sections may hold several
    items separated by ``|``; each item is stripped, empty items are dropped
    and the remainder is sorted, so two answers listing the same items in a
    different order parse identically.

    Args:
        s: The tagged answer text.

    Returns:
        A ``(causes, effects, relation)`` tuple: two lists of strings and the
        stripped relation string. Note the order differs from the order of
        the tags in the text. When ``s`` does not match the layout the result
        is ``([], [], "")``, so the first two items are always lists.
    """
    match = re.search(r"\[Cause\](.*?)\[Relation\](.*?)\[Effect\](.*?)$", s)
    if match is None:
        # Same element types as the success path (list, list, str) so callers
        # never have to special-case a string where a list is expected.
        return [], [], ""

    raw_causes, raw_relation, raw_effects = match.groups()
    causes = sorted(x.strip() for x in raw_causes.split("|") if x.strip())
    effects = sorted(x.strip() for x in raw_effects.split("|") if x.strip())
    relation = raw_relation.strip()

    return causes, effects, relation


def parsec(col):
    """Build a row-wise parser bound to one column.

    Returns a function that takes a row (anything indexable by ``col``) and
    returns ``parse(row[col])``.
    """

    def parse_column(row):
        return parse(row[col])

    return parse_column


def fmt_answer(s):
    """Render a tagged answer as three lines: causes, relation, effects.

    The text is run through ``parse``. When at least one cause was found, the
    result is the causes list, the relation and the effects list, each on its
    own line (lists appear in their ``str()`` form). Otherwise ``s`` is
    returned unchanged.
    """
    causes, effects, relation = parse(s)
    if causes:
        return f"{causes}\n{relation}\n{effects}"

    # No causes were parsed: hand the original text back untouched.
    return s


def read_json(path):
    """Load gold answers and predictions from a JSON dump into one frame.

    The file must hold an object with two record lists, ``"label_ids"`` and
    ``"predictions"``. Both are turned into frames and joined on their
    ``"id"`` field; only ids present in both lists survive the join.

    Args:
        path: Location of the JSON file.

    Returns:
        A DataFrame with two columns: ``"gold"`` (taken from ``"answers"``)
        and ``"pred"`` (taken from ``"prediction_text"``).

    Raises:
        KeyError: If either top-level list or an expected field is missing.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which would garble non-ASCII answer text on some systems.
    with open(path, encoding="utf-8") as f:
        payload = json.load(f)

    label = pd.DataFrame.from_records(payload["label_ids"])
    pred = pd.DataFrame.from_records(payload["predictions"])
    joined = pd.merge(label, pred, on="id")

    return joined[["answers", "prediction_text"]].rename(
        columns={
            "answers": "gold",
            "prediction_text": "pred",
        }
    )


def process_df(df):
    """Split gold/pred answers into parts and flag exact matches.

    Args:
        df: Frame with string columns ``"gold"`` and ``"pred"`` holding tagged
            answers. It is not modified; earlier versions overwrote these two
            columns in place, which made a second call on the same frame
            parse nothing.

    Returns:
        A ``(formatted, parsed)`` pair, both indexed like ``df``:

        * ``formatted`` -- a copy of ``df`` whose ``"gold"``/``"pred"`` hold
          the ``fmt_answer`` rendering, plus a boolean ``"correct"`` column.
        * ``parsed`` -- the columns ``gold.causes``, ``gold.effects``,
          ``gold.relation``, ``pred.causes``, ``pred.effects`` and
          ``pred.relation`` as produced by ``parse``, plus the same
          ``"correct"`` column.

        ``"correct"`` is True where the formatted gold and formatted
        prediction strings are equal.
    """
    parsed = pd.DataFrame(index=df.index)
    for src in ("gold", "pred"):
        cols = [f"{src}.causes", f"{src}.effects", f"{src}.relation"]
        # Build the three columns from a list of (causes, effects, relation)
        # tuples; unlike apply(..., result_type="expand") this also yields the
        # right columns when the input frame has no rows.
        triples = [parse(text) for text in df[src]]
        parsed[cols] = pd.DataFrame(triples, columns=cols, index=df.index)

    # Work on a copy so the caller's raw text stays available.
    formatted = df.copy()
    formatted["gold"] = df["gold"].map(fmt_answer)
    formatted["pred"] = df["pred"].map(fmt_answer)

    # One comparison, shared by both result frames.
    correct = formatted["gold"] == formatted["pred"]
    formatted["correct"] = correct
    parsed["correct"] = correct

    return formatted, parsed


def process_other(df):
    """Flatten the cause/effect lists to text and add a gold-based hash.

    For both ``gold`` and ``pred``, the ``causes`` and ``effects`` cells are
    replaced by their items lower-cased, stripped and joined with ``" | "``.
    A ``"hash"`` column is then added: the first 8 hex digits of the SHA-1 of
    the string form of the row's (gold.causes, gold.effects, gold.relation)
    tuple.

    The frame is modified in place and also returned.
    """

    def _flatten(items):
        return " | ".join(piece.lower().strip() for piece in items)

    def _digest(row):
        text = str(tuple(row))
        return hashlib.sha1(text.encode("utf-8")).hexdigest()[:8]

    list_columns = [
        f"{src}.{part}" for src in ("gold", "pred") for part in ("causes", "effects")
    ]
    for name in list_columns:
        df[name] = df[name].map(_flatten)

    gold_columns = ["gold.causes", "gold.effects", "gold.relation"]
    df["hash"] = df[gold_columns].apply(_digest, axis="columns")
    return df


# Load the prediction dump and derive two views of it:
#   valid -- formatted gold/pred text plus a "correct" (exact match) flag
#   other -- the parsed causes / effects / relation for gold and pred
df = read_json("original_tags/predict_outputs.json")
valid, other = process_df(df)
# Lower-case and join the cause/effect lists, and add the "hash" column.
other = process_other(other)
pp(valid.head())

Unnamed: 0,gold,pred,correct
0,"['BB&T and SunTrust have completed their merger, forming Truist'] enable ['which we believe will drive the next step up in profitability for the franchises']","['BB&T and SunTrust have completed their merger, forming Truist'] enable ['drive the next step up in profitability for the franchises']",False
1,"[""Given Tulip's lack of profitability (management has stated the business was not profitable at the time of the October 2019 acquisition)""] enable ['we do not believe the business maintains a cost advantage']","[""Tulip's lack of profitability""] cause ['we do not believe the business maintains a cost advantage']",False
2,"['pipeline transportation costs are not tied to the price of natural gas and crude oil'] cause [""TC Energy's profitability is not directly tied to commodity prices""]","['pipeline transportation costs are not tied to the price of natural gas and crude oil'] cause [""TC Energy's profitability is not directly tied to commodity prices""]",True
3,['Wynn has recently renovated rooms and the gaming floor space at its peninsula property'] enable ['help the facility maintain market share over the next few years'],['Wynn has recently renovated rooms and the gaming floor space at its peninsula property'] enable ['help the facility maintain market share over the next few years'],True
4,['recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19'] cause ['We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021'],['recent generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19'] cause ['We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021'],False


In [4]:
# First few rows whose formatted prediction differs from the formatted gold answer.
pp(valid[~valid["correct"]].head())

Unnamed: 0,gold,pred,correct
0,"['BB&T and SunTrust have completed their merger, forming Truist'] enable ['which we believe will drive the next step up in profitability for the franchises']","['BB&T and SunTrust have completed their merger, forming Truist'] enable ['drive the next step up in profitability for the franchises']",False
1,"[""Given Tulip's lack of profitability (management has stated the business was not profitable at the time of the October 2019 acquisition)""] enable ['we do not believe the business maintains a cost advantage']","[""Tulip's lack of profitability""] cause ['we do not believe the business maintains a cost advantage']",False
4,['recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19'] cause ['We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021'],['recent generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat COVID-19'] cause ['We are keeping our fair value estimate for Danaher at $160 per share after raising it in January 2021'],False
5,"['management will continue to pursue acquisition targets with high levels of recurring revenue'] prevent [""the cyclicality of Fortive's portfolio""]","['management will continue to pursue acquisition targets with high levels of recurring revenue'] cause [""further lower the cyclicality of Fortive's portfolio""]",False
6,"[""Beyond's products emit 90% less greenhouse gases, require 93% less land, 99% less water, and 46% less energy to produce""] cause ['the 20% of consumers willing to adjust their habits to benefit the environment']","[""Beyond's products emit 90% less greenhouse gases, require 93% less land, 99% less water, and 46% less energy to produce than their meat equivalents""] cause ['a primary growth driver to be the 20% of consumers willing to adjust their habits to benefit the environment']",False


In [5]:
# Inspect the parsed frame (the repr below elides the middle rows).
other

Unnamed: 0,gold.causes,gold.effects,gold.relation,pred.causes,pred.effects,pred.relation,correct,hash
0,"bb&t and suntrust have completed their merger, forming truist",which we believe will drive the next step up in profitability for the franchises,enable,"bb&t and suntrust have completed their merger, forming truist",drive the next step up in profitability for the franchises,enable,False,79460147
1,given tulip's lack of profitability (management has stated the business was not profitable at the time of the october 2019 acquisition),we do not believe the business maintains a cost advantage,enable,tulip's lack of profitability,we do not believe the business maintains a cost advantage,cause,False,ec89c529
2,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,True,594e66df
3,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,True,04203351
4,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,recent generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,False,5a3ca6d1
...,...,...,...,...,...,...,...,...
2428,large global cros,"produce the best trial designs and enroll patients into trials quickly and efficiently, which allows drugs to reach the market faster",enable,"large global cros, such as syneos, have developed regulatory expertise and proprietary data",drugs to reach the market faster,enable,False,2a434bed
2429,the covid-19 outbreak forcing some classes online,continued but moderating declines in publishing,cause,the covid-19 outbreak forcing some classes online,moderating declines in publishing,cause,False,ff66f0eb
2430,continued but moderating declines in publishing,"offsetting growth in courseware, corporate learning, test preparation, and other more advantaged businesses",cause,moderating declines in publishing,"growth in courseware, corporate learning, test preparation, and other more advantaged businesses",prevent,False,0d3e65c4
2431,hard-pressed motorists delay new vehicle purchases and instead look to keep their current cars and trucks running,the automotive segment carries less risk,cause,hard-pressed motorists delay new vehicle purchases and instead look to keep their current cars and trucks running,the automotive segment carries less risk,cause,True,c7f656cc


In [6]:
# Persist the parsed frame. This runs before the containment columns are
# added further down, so the file holds only the columns listed in the next cell.
other.to_parquet("original.parquet")

In [7]:
# Column set at this point in the notebook.
other.columns

Index(['gold.causes', 'gold.effects', 'gold.relation', 'pred.causes',
       'pred.effects', 'pred.relation', 'correct', 'hash'],
      dtype='object')

In [8]:
def cont(x, y):
    """Report whether any item of ``x`` and any item of ``y`` contain each other.

    Both arguments are ``|``-separated strings. Each is split into items that
    are lower-cased and stripped, with blank items discarded. The result is
    True as soon as one item from ``x`` is a substring of one item from ``y``
    or the other way round, and False otherwise (including when either side
    has no items).
    """
    left_items = [piece.lower().strip() for piece in x.split("|") if piece.strip()]
    right_items = [piece.lower().strip() for piece in y.split("|") if piece.strip()]

    return any(
        left in right or right in left
        for left in left_items
        for right in right_items
    )


def _cont(part):
    def f(row):
        gold = f"gold.{part}"
        pred = f"pred.{part}"
        return cont(row[gold], row[pred])

    return f


# Add one boolean column per part ("causes_cont", "effects_cont") telling
# whether gold and prediction overlap by substring containment. `other` is
# extended in place.
for part in ["causes", "effects"]:
    other[f"{part}_cont"] = other.apply(_cont(part), axis="columns")

# A row counts as contained only when both its causes and its effects overlap.
other["cont"] = other["causes_cont"] & other["effects_cont"]
other.head()

Unnamed: 0,gold.causes,gold.effects,gold.relation,pred.causes,pred.effects,pred.relation,correct,hash,causes_cont,effects_cont,cont
0,"bb&t and suntrust have completed their merger, forming truist",which we believe will drive the next step up in profitability for the franchises,enable,"bb&t and suntrust have completed their merger, forming truist",drive the next step up in profitability for the franchises,enable,False,79460147,True,True,True
1,given tulip's lack of profitability (management has stated the business was not profitable at the time of the october 2019 acquisition),we do not believe the business maintains a cost advantage,enable,tulip's lack of profitability,we do not believe the business maintains a cost advantage,cause,False,ec89c529,True,True,True
2,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,pipeline transportation costs are not tied to the price of natural gas and crude oil,tc energy's profitability is not directly tied to commodity prices,cause,True,594e66df,True,True,True
3,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,wynn has recently renovated rooms and the gaming floor space at its peninsula property,help the facility maintain market share over the next few years,enable,True,04203351,True,True,True
4,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,recent generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,False,5a3ca6d1,False,True,False


In [9]:
# Row counts for every combination of cause containment and effect containment.
other.groupby(["causes_cont", "effects_cont"]).size()

causes_cont  effects_cont
False        False            223
             True             189
True         False            191
             True            1830
dtype: int64

In [10]:
# Absolute and relative frequency of rows where both parts are contained.
other["cont"].value_counts(), other["cont"].value_counts(normalize=True)

(True     1830
 False     603
 Name: cont, dtype: int64,
 True     0.752158
 False    0.247842
 Name: cont, dtype: float64)

In [11]:
# Exact-match tally over all rows.
print(other["correct"].value_counts())

# Among the rows that are not exact matches, how many still pass the
# containment check: counts and proportions side by side.
wrong = other.query("not correct")
pd.concat(
    (wrong["cont"].value_counts(), wrong["cont"].value_counts(normalize=True)), axis=1
)

False    1771
True      662
Name: correct, dtype: int64


Unnamed: 0,cont,cont.1
True,1168,0.659514
False,603,0.340486


In [12]:
# Up to 20 rows whose causes fail the containment check, for manual inspection.
pp(other[~other["causes_cont"]].head(20))

Unnamed: 0,gold.causes,gold.effects,gold.relation,pred.causes,pred.effects,pred.relation,correct,hash,causes_cont,effects_cont,cont
4,recently generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,recent generated cash flows and our higher near-term outlook primarily for its tools used to diagnose and develop products to treat covid-19,we are keeping our fair value estimate for danaher at $160 per share after raising it in january 2021,cause,False,5a3ca6d1,False,True,False
17,low crime statistics see higher demand,higher rents and higher growth,cause,provide food and entertainment options,higher demand,cause,False,0eaf9154,False,False,False
40,others have exited,the number of players falling from 10 to three,cause,an inability to meet the rising investments to keep pace with higher-capacity hdds and the shift to ssds,others have exited,cause,False,aa906242,False,False,False
53,the concentration of high-quality tenants across simon's portfolio,provide an experience for consumers that is hard for other retailers to replicate,cause,the coronavirus,significant uncertainty about the future of sales growth,cause,False,86564276,False,False,False
59,defective equipment or components from abb's automation or electrical portfolio,product recalls,cause,"the product portfolio is diverse, and we believe the company has internal controls to minimize recall risk",defective equipment or components from abb's automation or electrical portfolio could lead to product recalls,cause,False,e3aad36b,False,True,False
83,without the contracts in place,cheniere would struggle,cause,north american gas is not some of the cheapest gas on the global cost curve,"without the contracts in place, cheniere would struggle",cause,False,7de27477,False,True,False
94,"an attended rpa product, monitors agents' desktop activities, identifies repetitive processes which can be automated,",they can focus on the most important activity- servicing the customer.,enable,frees up agents from routine tasks,they can focus on the most important activity- servicing the customer,enable,False,56d0c80e,False,True,False
103,part of the workforce is covered by collective agreements,result in work stoppages if work agreements cannot be reached,cause,work agreements cannot be reached,work stoppages,cause,False,ca93b01f,False,True,False
133,"these involve a system of motors, sensors, and microprocessors that power equipment and simultaneously feed back key measurements into abb's control software","at their core, they both run machines and ensure efficiencies, as well as other operational performance metrics, through control systems",enable,"at their core, they both run machines and ensure efficiencies, as well as other operational performance metrics, through control systems","involve a system of motors, sensors, and microprocessors that power equipment and simultaneously feed back key measurements into abb's control software",cause,False,c2c49a9b,False,False,False
141,wechat or mobile qq become the go-to platforms,"they possess a sizable internet content & informationamount of miniprograms, existing miniprogram developers also benefit from the overall increase in traffic",cause,they possess a sizable internet content & informationamount of miniprograms,existing miniprogram developers also benefit from the overall increase in traffic,cause,False,9ae5d2b8,False,True,False


In [13]:
# Column set after the three containment flags have been added.
other.columns

Index(['gold.causes', 'gold.effects', 'gold.relation', 'pred.causes',
       'pred.effects', 'pred.relation', 'correct', 'hash', 'causes_cont',
       'effects_cont', 'cont'],
      dtype='object')