In [1]:
import pandas as pd

def load(p: str) -> pd.DataFrame:
    """Read a CSV file and convert its hexadecimal transaction ids to integers.

    Args:
        p: Path of the CSV file. It must contain a
            ``transaction_reference_id`` column holding hexadecimal text.

    Returns:
        The parsed frame, with ``transaction_reference_id`` replaced by the
        integer value of each hexadecimal string.

    Raises:
        ValueError: If an id is not valid hexadecimal text.
    """
    # Force the id column to be read as text. Left to dtype inference, a column
    # whose values all happen to look numeric (e.g. "10", "1e5") is parsed as
    # int/float and int(x, 16) then fails with TypeError.
    df = pd.read_csv(p, dtype={'transaction_reference_id': str})
    df['transaction_reference_id'] = df['transaction_reference_id'].apply(lambda x: int(x, 16))
    return df

# Load the test and train split of both tables from the working directory.
# The trailing underscore marks the frames exactly as returned by load(); later
# cells pick a split from these names instead of re-reading the files.
account_booking_test_ = load("account_booking_test.csv")
account_booking_train_ = load("account_booking_train.csv")
external_parties_test_ = load("external_parties_test.csv")
external_parties_train_ = load("external_parties_train.csv")

In [2]:
# Choose the split used by the exploratory cells below: True selects the
# training frames, False selects the test frames.
USE_TRAINING_DATA = True

if USE_TRAINING_DATA:
    account_booking = account_booking_train_
    external_parties = external_parties_train_
else:
    account_booking = account_booking_test_
    external_parties = external_parties_test_

In [3]:
def remove_internal(df: pd.DataFrame) -> pd.DataFrame:
    """Discard every row whose ``transaction_reference_id`` occurs more than once.

    All occurrences of a repeated id are removed, not just the extra ones, so
    only ids that appear exactly once survive. The input frame is left as is.
    (Going by the function name, repeated ids presumably mark internal
    transfers - confirm against the data description.)
    """
    repeated = df.duplicated(subset=["transaction_reference_id"], keep=False)
    return df[~repeated]

# Keep only the bookings whose transaction_reference_id is unique; every row of
# a repeated id is discarded (presumably those are internal transfers - confirm).
account_booking = remove_internal(account_booking)

In [4]:
# Both frames are matched on transaction_reference_id further down, so the
# column must be stored with the same dtype on each side.
booking_id_dtype = account_booking['transaction_reference_id'].dtype
party_id_dtype = external_parties['transaction_reference_id'].dtype
assert booking_id_dtype == party_id_dtype

In [5]:
# Attach the booking columns to every external-party row by matching on the
# transaction id (DataFrame.join defaults to a left join), then turn the id
# back into an ordinary column.
parties_by_id = external_parties.set_index('transaction_reference_id')
bookings_by_id = account_booking.set_index('transaction_reference_id')
joined = parties_by_id.join(bookings_by_id, on='transaction_reference_id').reset_index()

In [6]:
# Every joined row must carry a parsed name.
assert not joined['parsed_name'].isna().any()

In [7]:
from simhash import Simhash
import pickle

# Lookup tables consulted by bucket_name / bucket_city below; both are indexed
# with a row's transaction_reference_id.
d_name = {}
d_city = {}

# Replace the empty placeholders with the tables stored in the working
# directory. How the pickle files were produced is not shown in this notebook.
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only
# load files that come from a trusted run of this project.
with open("d_name.pkl" , 'rb') as f:
    d_name = pickle.load(f)
with open("d_city.pkl" , 'rb') as f:
    d_city = pickle.load(f)

def bucket_name(df: pd.Series) -> int | None:
    """Return the precomputed name bucket for one row.

    Args:
        df: A single row. The function is used with
            ``DataFrame.apply(..., axis=1)``, which passes each row as a
            ``pd.Series`` (the parameter name is kept for compatibility).
            Only its ``transaction_reference_id`` entry is read.

    Returns:
        The value stored in ``d_name`` for the row's transaction id.
        Presumably a Simhash value of ``parsed_name`` computed ahead of time
        (an earlier inline version computed ``Simhash(parsed_name).value``) -
        TODO confirm against whatever produced ``d_name.pkl``.

    Raises:
        KeyError: If the transaction id has no entry in ``d_name``.
    """
    transaction_id = df['transaction_reference_id']
    return d_name[transaction_id]

def bucket_city(df: pd.Series) -> int | None:
    """Return the precomputed city bucket for one row.

    Args:
        df: A single row, passed as a ``pd.Series`` when the function is used
            with ``DataFrame.apply(..., axis=1)`` (the parameter name is kept
            for compatibility). Only its ``transaction_reference_id`` entry is
            read.

    Returns:
        The value stored in ``d_city`` for the row's transaction id.
        Presumably a Simhash value of ``parsed_address_city`` computed ahead of
        time, with a non-string city treated as the empty string (that is what
        an earlier inline version did) - TODO confirm against whatever produced
        ``d_city.pkl``.

    Raises:
        KeyError: If the transaction id has no entry in ``d_city``.
    """
    transaction_id = df['transaction_reference_id']
    return d_city[transaction_id]

def bucket_iban(df: pd.DataFrame) -> int | None:
    return df['party_iban']

In [8]:
from typing import List, Any, Tuple
from disjoint_set import DisjointSet

def add_linear_id(start_idx: int, df: pd.DataFrame) -> pd.DataFrame:
    """Number the rows of ``df`` consecutively, beginning at ``start_idx``.

    The numbers are written to an ``external_id`` column on ``df`` itself (the
    frame is modified in place) and that same frame object is returned.
    """
    row_count = len(df)
    df['external_id'] = range(start_idx, start_idx + row_count)
    return df


def bucket_by_fn(df_: pd.DataFrame, fns: List[Any]) -> pd.DataFrame:
    """Cluster rows that share a bucket key under at least one of ``fns``.

    Each function is applied to every row (``axis=1``) to produce a bucket key.
    Rows with equal keys are merged in a union-find structure, and merges
    accumulate across functions, so two rows end up together when a chain of
    shared keys connects them.

    Args:
        df_: Frame with a ``transaction_reference_id`` column. It is not
            modified; the result is built on a copy.
        fns: Callables taking one row and returning its bucket key. Rows whose
            key is missing (NaN/None) are not merged by that function, because
            ``groupby`` skips missing keys by default.

    Returns:
        A single frame: a copy of ``df_`` with an added ``external_id`` column
        holding the representative ``transaction_reference_id`` of the row's
        cluster. When ``fns`` is non-empty it also carries a ``bucket`` column
        with the keys of the last function, as the previous implementation did.
        With an empty ``fns`` every row is its own cluster.
    """
    # Copy first: the input used to be aliased, which added columns to the
    # caller's frame and made the length check at the end vacuous.
    df = df_.copy()
    d = DisjointSet()
    for fn in fns:
        df['bucket'] = df.apply(fn, axis=1)
        # Walk the id column bucket by bucket and link every member to the
        # first one; a bucket with a single member produces no union.
        for _, ids in df.groupby('bucket')['transaction_reference_id']:
            members = ids.tolist()
            first_id = members[0]
            for new_id in members[1:]:
                d.union(first_id, new_id)

        print(f"fn {fn} applied")

    # Ids that were never passed to union() are resolved here too; this relies
    # on DisjointSet.find() reporting an unseen element as its own set, exactly
    # as the row-wise lookup it replaces did.
    df['external_id'] = df['transaction_reference_id'].map(lambda tid: int(d.find(tid)))
    assert len(df) == len(df_)
    return df

In [9]:
def do_thing(booking: pd.DataFrame, external: pd.DataFrame) -> pd.DataFrame:
    """Join the external parties with their bookings and cluster the rows.

    Args:
        booking: Booking frame with a ``transaction_reference_id`` column.
        external: External-party frame with ``transaction_reference_id`` and
            ``parsed_name`` columns; ``parsed_name`` must have no missing
            values.

    Returns:
        The frame produced by ``bucket_by_fn`` when it is given the joined rows
        and the key functions ``bucket_iban`` and ``bucket_name``, in that
        order.

    Raises:
        AssertionError: If ``parsed_name`` is missing anywhere, before or after
            the join.
    """
    id_col = 'transaction_reference_id'
    assert not external['parsed_name'].isna().any()

    # Booking columns are attached to each external row by transaction id.
    external_by_id = external.set_index(id_col)
    booking_by_id = booking.set_index(id_col)
    joined = external_by_id.join(booking_by_id, on=id_col).reset_index()
    print('join finished, working on', len(joined), 'elements')
    assert not joined['parsed_name'].isna().any()

    key_functions = [bucket_iban, bucket_name]
    final = bucket_by_fn(joined, key_functions)
    print(final.head(5))
    return final

In [15]:
# Cluster the test split and write the result to submission.csv.
result = do_thing(account_booking_test_, external_parties_test_)
# result = do_thing(account_booking_train_, external_parties_train_)
# Turn the integer ids back into lowercase hexadecimal text.
# NOTE(review): "{x:x}" emits no leading zeros, so an id whose original text
# began with "0" comes back shorter than it was read - confirm the expected id
# format (a fixed-width spec such as "{x:032x}" may be needed).
result['transaction_reference_id'] = result['transaction_reference_id'].apply(lambda x: f"{x:x}")
# Remove columns that are not wanted in the output (presumably the ones the
# join brought in from the booking table - confirm).
# NOTE(review): the 'bucket' column that bucket_by_fn adds is not in this list,
# so it is written to the CSV as well - confirm that is intended.
result = result.drop(['debit_credit_indicator', 'account_id', 'transaction_amount', 'transaction_currency', 'transaction_date'], axis=1)
result.to_csv('submission.csv', index=False)

join finished, working on 1481672 elements
fn <function bucket_iban at 0x7fb5d20f98a0> applied
fn <function bucket_name at 0x7fb5d20f96c0> applied
                  transaction_reference_id party_role  \
0  190671140705715423650009635267226994432       BENE   
1  293239573528033791419170337610631075102        ORG   
2   55541625012552974453007531543628117875        ORG   
3   75674421050931036465491628983469479240       BENE   
4  317628195792290486045749025856081283468        ORG   

                             party_info_unstructured  \
0  therese humphrey 6223 john brokos wilvoxside 7...   
1  holly dudley 1151 doug las islands danniellesi...   
2  mcfarlan dknc 74099 joseph inlet ramirezport 7...   
3  jamie cooper 50019 austin light johnbury saudi...   
4  fernandez, frost and gonzalez 6776 thomas cres...   

                     parsed_name     parsed_address_street_name  \
0               therese humphrey                    john brokos   
1                   holly dudley  doug 

In [11]:
def _count_pairs(groups, other_labels):
    """Count ordered member pairs of ``groups`` that ``other_labels`` keeps together or splits."""
    together = 0
    apart = 0
    for members in groups:
        for a in members:
            for b in members:
                if a == b:
                    continue
                if other_labels[a] == other_labels[b]:
                    together += 1
                else:
                    apart += 1
    return together, apart


def _multi_member_groups(frame):
    """List the transaction ids of every ``external_id`` that labels more than one row."""
    shared = frame[frame.duplicated(subset='external_id', keep=False)]
    return [ids.tolist() for _, ids in shared.groupby("external_id")["transaction_reference_id"]]


def compare(inp: pd.DataFrame, truth: pd.DataFrame):
    """Compute the pairwise F1 score of a predicted clustering against the truth.

    A pair of transactions is a true positive when both frames give the two the
    same ``external_id``, a false negative when only ``truth`` does, and a
    false positive when only ``inp`` does. Pairs are counted in both orders,
    which doubles every count and therefore leaves the ratios unchanged.

    Args:
        inp: Predicted clustering, with ``transaction_reference_id`` and
            ``external_id`` columns.
        truth: Reference clustering with the same two columns and the same
            number of rows.

    Returns:
        The F1 score as a float, or ``0.0`` when no pair is predicted correctly
        (precision and recall would otherwise divide by zero).

    Raises:
        AssertionError: If the frames differ in length.
        KeyError: If a transaction id in a multi-member group of one frame is
            absent from the other frame.
    """
    assert len(inp) == len(truth)

    truth_map = dict(zip(truth["transaction_reference_id"], truth["external_id"]))
    guess_map = dict(zip(inp["transaction_reference_id"], inp["external_id"]))

    # Pairs the truth puts together: kept together by the guess -> tp, split -> fn.
    tp, fn = _count_pairs(_multi_member_groups(truth), guess_map)
    # Pairs the guess puts together but the truth separates -> fp. Both ids are
    # looked up in the truth labels (one side used to read the guess labels).
    _, fp = _count_pairs(_multi_member_groups(inp), truth_map)

    if tp == 0:
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * (precision * recall) / (precision + recall)
                
# result = do_thing(account_booking_train_, external_parties_train_)

# print(compare(result, external_parties_train_))