In [1]:
import json
import hashlib
import glob
import re

import pandas as pd

In [2]:
def cont(x, y):
    """Tell whether one string contains the other.

    Both arguments are lower-cased and stripped of surrounding whitespace
    before the comparison, so the check is case-insensitive. Returns True
    if either normalised string is a substring of the other.
    """
    left, right = (text.lower().strip() for text in (x, y))
    if left in right:
        return True
    return right in left


def parse(s):
    """Break a tagged answer string into causes, effects and relation.

    The string is expected to look like
    ``[Cause]...[Relation]...[Effect]...`` where several causes or effects
    may be separated by ``|``.

    Returns a 3-tuple ``(causes, effects, relation)`` -- note that effects
    come *before* relation, unlike the tag order in the input. ``causes``
    and ``effects`` are sorted lists of stripped, non-empty fragments and
    ``relation`` is a stripped string. If the pattern is not found, three
    empty strings are returned instead.
    """
    found = re.search(r"\[Cause\](.*?)\[Relation\](.*?)\[Effect\](.*?)$", s)
    if found is None:
        return '', '', ''

    raw_causes, raw_relation, raw_effects = found.groups()

    def _split(raw):
        # Drop fragments that are empty once whitespace is removed.
        pieces = [piece.strip() for piece in raw.split("|")]
        return sorted(piece for piece in pieces if piece)

    return _split(raw_causes), _split(raw_effects), raw_relation.strip()


def parsec(col):
    """Build a callable that parses the value a row holds under ``col``.

    The returned function takes a row (anything indexable by ``col``) and
    returns ``parse(row[col])``.
    """
    def _parse_row(row):
        return parse(row[col])

    return _parse_row


def fmt_answer(s):
    """Render a tagged answer as three lines: causes, relation, effects.

    The string is run through ``parse``. When that yields no causes
    (no match, or an empty cause section) the input is returned unchanged.
    Otherwise the causes list, the relation and the effects list are
    formatted one per line, the lists using their default ``str`` form.
    """
    parsed = parse(s)
    causes, effects, relation = parsed
    if causes:
        return f'{causes}\n{relation}\n{effects}'
    return s


def read_json(path):
    """Load a results JSON file into a frame of context / gold / pred.

    The file must be a JSON object with two keys, ``label_ids`` and
    ``predictions``, each holding a list of records that carry an ``id``.
    The two record lists are joined on ``id`` (pandas' default inner
    merge), and the columns ``context``, ``answers`` and
    ``prediction_text`` are kept, the latter two renamed to ``gold`` and
    ``pred``.

    NOTE(review): ``context`` is presumably present in only one of the two
    record lists; if both carried it, the merge would suffix the column
    names and the selection below would fail -- confirm against the data.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    pandas.DataFrame
        Columns ``context``, ``gold``, ``pred``.
    """
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's locale default.
    with open(path, encoding='utf-8') as f:
        j = json.load(f)

    label = pd.DataFrame.from_records(j['label_ids'])
    pred = pd.DataFrame.from_records(j['predictions'])
    join = pd.merge(label, pred, on='id')

    df = join[['context', 'answers', 'prediction_text']].rename(columns={
        'answers': 'gold',
        'prediction_text': 'pred',
    })
    return df


def process_df(df):
    """Parse the ``answers`` column into separate gold columns.

    Each row's ``answers`` value is parsed and the three resulting fields
    are expanded into the columns ``gold.causes``, ``gold.effects`` and
    ``gold.relation`` of a new frame; the ``context`` column is copied
    across from ``df``. The input frame is not modified.
    """
    gold_columns = ['gold.causes', 'gold.effects', 'gold.relation']
    parsed = df.apply(parsec('answers'), axis='columns', result_type='expand')

    result = pd.DataFrame()
    result[gold_columns] = parsed
    result['context'] = df['context']
    return result


def process_other(df):
    """Normalise the gold cause/effect columns and add a short row hash.

    ``gold.causes`` and ``gold.effects`` are each collapsed into one
    string: every item is lower-cased and stripped, then the items are
    joined with ``' | '``. A ``hash`` column is then added containing the
    first 8 hex digits of the SHA-1 of the string form of the row's
    ``(gold.causes, gold.effects, gold.relation)`` tuple.

    The frame is modified in place and is also returned. Calling the
    function again on an already processed frame leaves the values
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``gold.causes``, ``gold.effects`` and
        ``gold.relation``. Cause/effect cells may be lists of strings or a
        single string.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with normalised columns and ``hash``.
    """
    def _join(items):
        # A bare string is a single item, not an iterable of characters.
        # Without this, a non-empty string cell (e.g. on a second run over
        # the same frame) would be exploded into 'a |   | | |   | b'.
        if isinstance(items, str):
            items = [items]
        return ' | '.join(s.lower().strip() for s in items)

    def _hash(x):
        return hashlib.sha1(str(x).encode('utf-8')).hexdigest()[:8]

    for part in ['causes', 'effects']:
        col = f'gold.{part}'
        df[col] = df[col].map(_join)

    cols = [f'gold.{part}' for part in ['causes', 'effects', 'relation']]
    df['hash'] = df[cols].apply(lambda row: _hash(tuple(row)), axis='columns')
    return df


# Load the raw test split. JSON text is UTF-8, so the encoding is given
# explicitly rather than left to the platform's locale default.
with open('genqa_joint/test.json', encoding='utf-8') as f:
    j = json.load(f)
df = pd.DataFrame.from_records(j['data'])

# Parse the gold answers into cause / effect / relation columns, then
# normalise them and attach the per-row hash.
other = process_df(df)
other = process_other(other)
other.head()

Unnamed: 0,gold.causes,gold.effects,gold.relation,context,hash
0,"bb&t and suntrust have completed their merger,...",which we believe will drive the next step up i...,enable,"BB&T and SunTrust have completed their merger,...",79460147
1,given tulip's lack of profitability (managemen...,we do not believe the business maintains a cos...,enable,Given Tulip's lack of profitability (managemen...,ec89c529
2,pipeline transportation costs are not tied to ...,tc energy's profitability is not directly tied...,cause,TC Energy's profitability is not directly tied...,594e66df
3,wynn has recently renovated rooms and the gami...,help the facility maintain market share over t...,enable,"That said, Wynn has recently renovated rooms a...",04203351
4,recently generated cash flows and our higher n...,we are keeping our fair value estimate for dan...,cause,We are keeping our fair value estimate for Dan...,5a3ca6d1


In [3]:
# Persist the processed frame (gold columns, context and hash) as Parquet.
other.to_parquet('source.parquet')