In [1]:
import pandas as pd

def load(p: str) -> pd.DataFrame:
    df = pd.read_csv(p)
    df['transaction_reference_id'] = df['transaction_reference_id'].apply(lambda x: int(x, 16))
    return df

account_booking_test_ = load("account_booking_test.csv")
account_booking_train_ = load("account_booking_train.csv")
external_parties_test_ = load("external_parties_test.csv")
external_parties_train_ = load("external_parties_train.csv")

In [2]:
def clean_data(line):
    return line.lower() \
        .replace('mr', '') \
        .replace('ms', '') \
        .replace('mrs', '') \
        .replace('miss ', '') \
        .replace('dr', '') \
        .replace('prof', '') \
        .replace('rev', '') \
        .replace('hon', '') \
        .replace('.', '')

def cleanup(df: pd.DataFrame):
    df["parsed_name"] = df["parsed_name"].apply(clean_data)

cleanup(external_parties_train_)
cleanup(external_parties_test_)

In [3]:
# if training d
if False:
    account_booking = account_booking_train_
    external_parties = external_parties_train_
else:
    account_booking = account_booking_test_
    external_parties = external_parties_test_

In [4]:
def remove_internal(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates(subset=["transaction_reference_id"], keep=False)

account_booking = remove_internal(account_booking)

In [5]:
assert account_booking.dtypes['transaction_reference_id'] == external_parties.dtypes['transaction_reference_id']

In [6]:
joined = external_parties.set_index('transaction_reference_id').join(
    account_booking.set_index('transaction_reference_id'),
    on='transaction_reference_id'
).reset_index()

In [7]:
assert len(joined[pd.isna(joined['parsed_name'])]) == 0

In [117]:
from simhash import Simhash
from datetime import date
import pickle
from math import isnan

d_name = {}
d_city = {}

with open("d_name.pkl" , 'rb') as f:
    d_name = pickle.load(f)
with open("d_city.pkl" , 'rb') as f:
    d_city = pickle.load(f)

def bucket_iban(df: pd.DataFrame) -> int | None:
    return df['party_iban']

def bucket_name(df: pd.DataFrame) -> int | None:
    # parsed_name = df['parsed_name']
    # return Simhash(parsed_name).value
    d_name[df['transaction_reference_id']]
    return d_name[df['transaction_reference_id']]

def bucket_city(df: pd.DataFrame) -> int | None:
    return df['parsed_address_city']

def bucket_postal_code(df: pd.DataFrame) -> int | None:
    return df['parsed_address_postal_code']

def bucket_date(df: pd.DataFrame) -> int | None:
    return date.fromisoformat(df['transaction_date']).day

def bucket_name_street(df: pd.DataFrame) -> int | None:
    if has_field(df, 'parsed_address_street_name'):
        parsed_name = df['parsed_name']
        street = df['parsed_address_street_name']
        street = 0 if (type(street) == float and isnan(street)) else str(street)
        return f"{Simhash(parsed_name).value}-{street}"
    else:
        return f"{df['transaction_reference_id']}"
    
def has_field(df: pd.DataFrame, field: str) -> bool:
    # P => Q == not P or Q
    return field in df and (not type(df[field]) != str or not isnan(df[field]))

def bucket_address(df: pd.DataFrame) -> int | None:
    if has_field(df, 'parsed_address_street_name') and has_field(df, 'parsed_address_city') and has_field(df, 'parsed_address_postal_code'):
        street = df['parsed_address_street_name']
        city = df['parsed_address_city']
        code = df['parsed_address_postal_code']
        street = 0 if (type(street) == float and isnan(street)) else str(street)
        city = 0 if (type(city) == float and isnan(city)) else str(city)
        code = 0 if (type(code) == float and isnan(code)) else str(code)
        return f"{Simhash(street).value}-{city}-{code}"
    else:
        return f"{df['transaction_reference_id']}"

In [158]:
from typing import List, Any, Dict
from datetime import datetime

def add_linear_id(start_idx: int, df: pd.DataFrame) -> pd.DataFrame:
    df['external_id'] = range(start_idx, start_idx + len(df))
    return df

# runs the provided functions on the dataframe and progressively groups using a function
# at a time. Returns a dataframe where all rows have been grouped with an associated ID,
# and a dataframe with still ungrouped rows.
def bucket_by_fn(df: pd.DataFrame, fns: List[Any]) -> Dict[int, int]:
    d = []
    already_grouped = set()
    for fn in fns:
        start = datetime.now()
        removed_this_iter = set()
        df['bucket'] = df.apply(fn, axis=1)
        grouped = df.groupby('bucket')
        for _, group in grouped:
            if len(group) < 1:
                continue

            ids = group['transaction_reference_id']
            if len(ids) > 1 and all(id not in already_grouped for id in group):
                s = set(ids)
                d.append(s)
                removed_this_iter = removed_this_iter.union(s)
                already_grouped = already_grouped.union(s)

        print(f"fn {fn} applied, took", (datetime.now() - start))
        df = df[~df['transaction_reference_id'].isin(removed_this_iter)].drop('bucket', axis=1)
    i = 0
    result = {}

    for elems in d:
        assert len(elems) > 1

        for elem in elems:
            result[f"{elem:0{32}x}"] = i
        i += 1

    return result

In [159]:
def do_thing(booking: pd.DataFrame, external: pd.DataFrame) -> pd.DataFrame:
    assert len(external[pd.isna(external['parsed_name'])]) == 0

    joined = external.set_index('transaction_reference_id').join(
        booking.set_index('transaction_reference_id'),
        on='transaction_reference_id'
    ).reset_index()
    print('join finished, working on', len(joined), 'elements')
    assert len(joined[pd.isna(joined['parsed_name'])]) == 0

    final = bucket_by_fn(joined, [bucket_iban, bucket_address, bucket_name])
    print(f"found {len(final)} tuples")
    return final

In [160]:
# result = do_thing(account_booking_test_, external_parties_test_)
result = do_thing(account_booking_train_, external_parties_train_)
df = pd.DataFrame(list(result.items()), columns=['transaction_reference_id', 'external_id'])
df.to_csv('submission.csv', index=False)

join finished, working on 11064 elements
fn <function bucket_iban at 0x7ff7b1508720> applied, took 0:00:00.309510
fn <function bucket_address at 0x7ff86c0dab60> applied, took 0:00:00.738900
fn <function bucket_name at 0x7ff7a9673c40> applied, took 0:00:00.519627
found 6615 tuples


In [138]:
def compare(inp: pd.DataFrame, truth: pd.DataFrame):
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    
    truth_map = {}
    guess_map = {}
    for _, x in truth.iterrows():
        truth_map[x["transaction_reference_id"]] = x["external_id"]
    
    for _, x in inp.iterrows():
        guess_map[x["transaction_reference_id"]] = x["external_id"]

    inp = inp[inp.duplicated(subset='external_id', keep=False)]
    truth = truth[truth.duplicated(subset='external_id', keep=False)]

    truth_gr = truth.groupby("external_id")["transaction_reference_id"].apply(list)
    guess_gr = inp.groupby("external_id")["transaction_reference_id"].apply(list)

    for t_group in truth_gr:
        for a in t_group:
            for b in t_group:
                if a == b:
                    continue

                ag, bg = guess_map.get(a), guess_map.get(b)

                if (ag and bg) and ag == bg:
                    tp += 1
                else:
                    fn += 1

    for g_group in guess_gr:
        for a in g_group:
            for b in g_group:
                if a == b:
                    continue
                
                at, bt = truth_map[a], truth_map[b]

                if at != bt:
                    fp += 1
                else:
                    tn += 1
    
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

removed_singleton = external_parties_train_[external_parties_train_.duplicated(subset='external_id', keep=False)]
print("Number of entities: ", len(df), "/", len(removed_singleton))
print("Number of groups: ", len(df.groupby('external_id').count()), "/", len(removed_singleton.groupby('external_id').count()))
# result = do_thing(account_booking_train_, external_parties_train_)
df['transaction_reference_id'] = df['transaction_reference_id'].apply(lambda x: int(x, 16))
print(compare(df, external_parties_train_))

Number of entities:  7091 / 8093
Number of groups:  2351 / 2029
0.7881309628882445


In [122]:
groups = df.set_index('transaction_reference_id').join(
        external_parties_train_.drop('external_id', axis=1).set_index('transaction_reference_id'),
        on='transaction_reference_id'
    ).reset_index().groupby('external_id')

g = pd.DataFrame()
for i, grp in groups:
    g = pd.concat([g, grp])
    if i > 4:
        break

g

Unnamed: 0,transaction_reference_id,external_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone
0,239128624715316126287324147774365357042,0,BENE,michelle burch 2525 julie cape n. chelseafurt ...,michelle burch,julie cape n. chelseafurt,2525,,79579,johndonmouth gibraltar,,,GB79UFZV17351069335041,+41 (514)5581455x05-855
1,50884749122045959552496753754684880211,0,ORG,michelle burch 2525 julie cape n. chelseafurt ...,michelle burch,julie cape n. chelseafurt,2525,,79579,johndonmouth gibraltar,,,GB19IRVO99827868383931,
2,217124689720374996951356866018687555801,0,ORG,michelle burch 2525 julie cape n. chelseafurt ...,michelle burch,julie cape n. chelseafurt,2525,,79579,johndonmouth gibraltar,,,GB27MHPZ53986586620706,0 353-608-0024-2100
3,63843871678862467993587667216022764867,1,ORG,"holland, zuniga and jones 92061 kim turnpike w...","holland, zuniga and jones",kim turnpike,92061,,74365,west pamelabury,,palestinian territory,GB25HIOA50088243322751,
4,309265492670470118033736339696703087598,1,ORG,"holland, zuniga and jones 92061 kim turnpike w...","holland, zuniga and jones",kim turnpike,92061,,74365,west pamelabury,,palestinian territory,GB25HIOA50088243322751,0+1-414-6801879607
5,1086659979692840615414056416039296093,1,ORG,zuniga end jones 92061 kim turnpike west pamel...,zuniga end jones,kim turnpike,92061,,74365,west pamelabury,,palestinian territory,GB25HIOA50088243322751,
6,185717889141358890358804128704512049273,2,ORG,mrs. kristina carlson ii 1784 jennifer hollow ...,s kristina carlson ii,jennifer hollow suite,1784 165w,,38120,est alison,,canadaa,GB80CVJT96682777926856,0041+1-678-708-8004x81372
7,125671815364839300457758667985796400674,2,ORG,mrs. kristina carlson ii 1784 jennifer hollow ...,s kristina carlson ii,jennifer hollow suite,1784 165w,,38120,est alison,,canadaa,GB03IUOB48794915212975,+41 6014298917
8,98037146238155606319775836233599114106,2,ORG,mrs. kristina carlson ii 1784 jennifer hollow ...,s kristina carlson ii,jennifer hollow suite,1784 165w,,38120,est alison,,canadaa,,
9,151619159838607853128736904304395623704,2,ORG,mrs. kristina carlson ii 1784 jennifer hollow ...,s kristina carlson ii,jennifer hollow suite,1784 165w,,38120,est alison,,canadaa,GB19FANM37314710159940,(+4 1) 001363-310- 9 545x30684


In [None]:
result = do_thing(account_booking_train_, external_parties_train_)
df = pd.DataFrame(list(result.items()), columns=['transaction_reference_id', 'external_id'])
df.to_csv('submission.csv', index=False)

join finished, working on 11064 elements
fn <function bucket_iban at 0x7ff7b1508040> applied
found 3872 tuples


In [None]:
booking = pd.DataFrame(account_booking_train_).drop_duplicates(subset=["transaction_reference_id"], keep=False)
parties = external_parties_train_[external_parties_train_.duplicated(subset="external_id", keep=False)]
data = booking.merge(parties, on="transaction_reference_id", how="inner").groupby('external_id').agg(list).reset_index()
print(data.iloc[0])

external_id                                                              20000042
transaction_reference_id        [313980040502377422398921352947482856636, 1670...
debit_credit_indicator                                   [CREDIT, CREDIT, CREDIT]
account_id                                                  [28523, 25210, 21675]
transaction_amount                                    [8101.48, 5879.79, 4590.15]
transaction_currency                                              [GBP, GBP, GBP]
transaction_date                             [2023-07-10, 2023-06-02, 2024-10-24]
party_role                                                        [ORG, ORG, ORG]
party_info_unstructured         [m. solomon 08/04/2000 826 gwendolyn plaza apt...
parsed_name                         [m solomon, marsolomon, mary mary solomon ii]
parsed_address_street_name      [gwendolyn plaza, gwendolyn plaza apt., gwendo...
parsed_address_street_number                                      [826, 826, 826]
parsed_address_u