In [177]:
import pandas as pd

account_booking_test_ = pd.read_csv("account_booking_test.csv")
account_booking_train_ = pd.read_csv("account_booking_train.csv")
external_parties_test_ = pd.read_csv("external_parties_test.csv")
external_parties_train_ = pd.read_csv("external_parties_train.csv")

In [178]:
# if training
if True:
    account_booking = account_booking_train_
    external_parties = external_parties_train_
else:
    account_booking = account_booking_test_
    external_parties = external_parties_test_

In [179]:
def remove_internal(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates(subset=["transaction_reference_id"], keep=False)

account_booking = remove_internal(account_booking)

In [180]:
assert account_booking.dtypes['transaction_reference_id'] == external_parties.dtypes['transaction_reference_id']

In [182]:
joined = account_booking.set_index('transaction_reference_id').join(
    external_parties.set_index('transaction_reference_id'),
    on='transaction_reference_id'
)

In [183]:
from typing import Tuple
from pandas.core.groupby.generic import DataFrameGroupBy

# folds all groups in gr that have length > 1 into base.
# The unfolded elements are ungrouped and a column named `external_id` is added
# to each based on their belonging group. The next group index is returned as the first item.
# All other groups are de-grouped and returned as a dataframe (third argument)
def fold_group(start_idx: int, base: pd.DataFrame, gr: DataFrameGroupBy) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
    unfolded = []  # List to store non-folded (de-grouped) data
    next_idx = start_idx  # Start index for the next group
    
    for group_key, group in gr:
        group = group.drop('bucket', axis=1)
        # If the group has more than 1 element, fold it into `base`
        if len(group) > 1:
            group['external_id'] = next_idx  # Add external_id column
            base = pd.concat([base, group], ignore_index=True)  # Append the group to `base`
            next_idx += 1  # Increment the index for the next group
        else:
            # If group has only one element, de-group it (store it separately)
            unfolded.append(group.iloc[0])

    # Combine all the unfolded groups (single-element groups)
    unfolded_df = pd.DataFrame(unfolded)

    # Return the next group index, updated base dataframe, and de-grouped dataframe
    return next_idx, base, unfolded_df

def add_linear_id(start_idx: int, df: pd.DataFrame) -> pd.DataFrame:
    df['external_id'] = range(start_idx, start_idx + len(df))
    return df

In [187]:
assert len(joined[pd.isna(joined['parsed_name'])]) == 0

In [188]:
from simhash import Simhash
from typing import List, Any, Tuple

def bucket_name(df: pd.DataFrame) -> int | None:
    parsed_name = df['parsed_name']
    return Simhash(parsed_name).value


def bucket_city(df: pd.DataFrame) -> int | None:
    parsed_address_city = df['parsed_address_city']
    city = parsed_address_city if type(parsed_address_city) is str else ""
    return Simhash(city).value

# runs the provided functions on the dataframe and progressively groups using a function
# at a time. Returns a dataframe where all rows have been grouped with an associated ID,
# and a dataframe with still ungrouped rows.
def bucket_by_fn(df: pd.DataFrame, fns: List[Any]) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
    base = pd.DataFrame()
    idx = 0
    for fn in fns:
        df['bucket'] = df.apply(fn, axis=1)
        grouped = df.groupby('bucket')
        new_idx, new_base, new_df = fold_group(idx, base, grouped)

        idx = new_idx
        base = new_base
        df = new_df

    return idx, df, base

idx, grouped, ungrouped = bucket_by_fn(joined, [bucket_name])
new_ungrouped = add_linear_id(idx, ungrouped)
final = pd.concat([grouped, new_ungrouped], ignore_index=True)
assert len(final) == len(joined)
len(final.groupby('external_id').count())

9541

In [190]:
d = pd.DataFrame(joined)
d['bucket'] = d.apply(bucket_name, axis=1)
grouped = d.groupby('bucket')
idx, group, ungrouped = fold_group(0, pd.DataFrame(), grouped)

final = pd.concat([group, add_linear_id(idx, ungrouped)], ignore_index=True)

Unnamed: 0,debit_credit_indicator,account_id,transaction_amount,transaction_currency,transaction_date,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,external_id
0,CREDIT,21165,8468.4,GBP,2023-02-17,ORG,w. west 487315718 68211 vincent ways suite 057...,w. west,w. west vincent ways suite,487315718,,68211 82285,stevefurt elizabethmouth,,,GB28VAHY84286320318992,,0
1,CREDIT,29034,1505.98,GBP,2024-05-29,ORG,w. west 487315718 68211 vincent ways suite 057...,w. west,w. west vincent ways suite,487315718,,68211 82285,stevefurt elizabethmouth,,,GB77HQHT26255107475021,,0
2,DEBIT,24120,1957.51,GBP,2024-01-30,BENE,m.a. amber 34150 haley cliff north donald pame...,m.a. amber,haley cliff north donald pamela,34150,,,,sao tome and principe,,GB22KLWY62854496883141,,1
3,DEBIT,23783,8358.75,GBP,2023-02-22,BENE,m.a. amber 34150 haley cliff north donald east...,m.a. amber,haley cliff north donald east,34150,,d628976,pamela,sao tome and principe,,,2054413303x6262,1
4,DEBIT,28132,4829.77,GBP,2023-05-24,BENE,m.a. amber 34150 jaley cliff n. donald east pa...,m.a. amber,jaley cliff n. donald east pamela,34150,,,,sao tome and principe,,GB12AJFG47261147109532,0057 593-510-13280442,1
