In [48]:
import pandas as pd
import os

stripe_reconciliation = pd.read_csv(os.path.join('reports', 'opensource', 'stripe_reconciliation.csv'))
paypal_reconciliation = pd.read_csv(os.path.join('reports', 'opensource', 'paypal_reconciliation.csv'))
wise_reconciliation = pd.read_csv(os.path.join('reports', 'opensource', 'wise_reconciliation.csv'))

# all platform transactions from all_platform_transactions.csv
all_platform_transactions = pd.read_csv(os.path.join('reports', 'opensource', 'all_platform_transactions.csv'))

  stripe_reconciliation = pd.read_csv(os.path.join('reports', 'opensource', 'stripe_reconciliation.csv'))
  paypal_reconciliation = pd.read_csv(os.path.join('reports', 'opensource', 'paypal_reconciliation.csv'))
  all_platform_transactions = pd.read_csv(os.path.join('reports', 'opensource', 'all_platform_transactions.csv'))


In [49]:
# for stripe_reconciliation, paypal_reconciliation, wise_reconciliation change the column name of _reconciliation_category to stripe._reconciliation_category, paypal._reconciliation_category, wise._reconciliation_category
stripe_reconciliation = stripe_reconciliation.rename(columns={'_reconciliation_category': 'stripe._reconciliation_category'})
paypal_reconciliation = paypal_reconciliation.rename(columns={'_reconciliation_category': 'paypal._reconciliation_category'})
wise_reconciliation = wise_reconciliation.rename(columns={'_reconciliation_category': 'wise._reconciliation_category'})

In [50]:
# join stripe, paypal, and wise reconciliations to all_platform_transactions, do not duplicate rows, drop columns with identical names
all_platform_transactions_merged = all_platform_transactions.merge(stripe_reconciliation, on='id', how='left', suffixes=('', '_stripe'))
all_platform_transactions_merged = all_platform_transactions_merged.merge(paypal_reconciliation, on='id', how='left', suffixes=('', '_paypal'))
all_platform_transactions_merged = all_platform_transactions_merged.merge(wise_reconciliation, on='id', how='left', suffixes=('', '_wise'))

In [51]:
# add a new column called _reconciliation_category that is the concatenation of stripe._reconciliation_category, paypal._reconciliation_category, wise._reconciliation_category
all_platform_transactions_merged['_reconciliation_category'] = all_platform_transactions_merged['stripe._reconciliation_category'].fillna('') + all_platform_transactions_merged['paypal._reconciliation_category'].fillna('') + all_platform_transactions_merged['wise._reconciliation_category'].fillna('')

# drop stripe._reconciliation_category, paypal._reconciliation_category, wise._reconciliation_category
all_platform_transactions_merged = all_platform_transactions_merged.drop(columns=['stripe._reconciliation_category', 'paypal._reconciliation_category', 'wise._reconciliation_category'])

In [52]:
# values count of _reconciliation_category
print(all_platform_transactions_merged['_reconciliation_category'].value_counts())

_reconciliation_category
                                        648373
stripe_platform_contributions           131332
paypal_platform_contributions            58315
wise_platform                             3403
stripe_platform_virtual_card              2180
paypal_platform_expenses                  1607
stripe_platform_contrubution_refunds       432
stripe_platform_disputes                    63
Name: count, dtype: int64


In [53]:
# group by 'group' id
grouped = all_platform_transactions_merged.groupby('group')

# Define a custom function to count unique non-null and non-empty '_reconciliation_category' values
def count_unique_non_null(series):
    # Filter out null and empty strings, then return the length of unique values
    return len(series[series != ''].dropna().unique())

# Apply the custom function to each group for the '_reconciliation_category'
unique_counts = grouped['_reconciliation_category'].agg(count_unique_non_null)

# Classify based on unique counts
reconciled_groups = unique_counts.index[unique_counts == 1].tolist()
ambiguous_groups = unique_counts.index[unique_counts > 1].tolist()
unreconciled_groups = unique_counts.index[unique_counts == 0].tolist()

# Print results
print('all groups:', len(grouped))
print('reconciled_groups:', len(reconciled_groups))
print('ambiguous_groups:', len(ambiguous_groups))
print('unreconciled_groups:', len(unreconciled_groups))


all groups: 202466
reconciled_groups: 196612
ambiguous_groups: 245
unreconciled_groups: 5609


In [54]:
reconciled_transactions = all_platform_transactions_merged[all_platform_transactions_merged['group'].isin(reconciled_groups)]
ambiguous_transactions = all_platform_transactions_merged[all_platform_transactions_merged['group'].isin(ambiguous_groups)]
unreconciled_transactions = all_platform_transactions_merged[all_platform_transactions_merged['group'].isin(unreconciled_groups)]

In [55]:
# Add reconciliation category to all transactions in reconciled groups

# Step 1: Get the first non-empty category for each group
first_non_empty_category = reconciled_transactions.groupby('group')['_reconciliation_category'].apply(
    lambda x: next((item for item in x if item != ''), None)
)

# Step 2: Update '_reconciliation_category' in reconciled_transactions by mapping the groups to their first non-empty category
reconciled_transactions['_reconciliation_category'] = reconciled_transactions['group'].map(first_non_empty_category)

# Step 3: Update '_reconciliation_category' in all_platform_transactions_merged in a similar manner
all_platform_transactions_merged['_reconciliation_category'] = all_platform_transactions_merged['group'].map(first_non_empty_category)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reconciled_transactions['_reconciliation_category'] = reconciled_transactions['group'].map(first_non_empty_category)


In [56]:
# Set the _reconciliation_category for all empty values in ambiguous_transactions to the value from the row in the group with the most recent createdAt date

# Step 1: Sort transactions by 'group' and 'createdAt' to ensure the latest transaction comes first
ambiguous_transactions_sorted = ambiguous_transactions.sort_values(by=['group', 'createdAt'], ascending=[True, False])

# Identify the most recent non-empty '_reconciliation_category' for each group
most_recent_category_by_group = ambiguous_transactions_sorted.groupby('group').apply(
    lambda x: x.loc[x['_reconciliation_category'] != '']['_reconciliation_category'].iloc[0] if any(x['_reconciliation_category'] != '') else None
).dropna()

# Step 2: Update '_reconciliation_category' for transactions with an empty category using the most recent non-empty category found
def update_category(row):
    if row['_reconciliation_category'] == '':
        return most_recent_category_by_group.get(row['group'], '')
    return row['_reconciliation_category']

ambiguous_transactions['_reconciliation_category'] = ambiguous_transactions.apply(update_category, axis=1)

# update all_platform_transactions_merged with id from ambiguous_transactions with the updated _reconciliation_category
all_platform_transactions_merged.update(ambiguous_transactions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ambiguous_transactions['_reconciliation_category'] = ambiguous_transactions.apply(update_category, axis=1)


In [57]:
# values count of _reconciliation_category in ambiguous_transactions
ambiguous_transactions['_reconciliation_category'].value_counts()

_reconciliation_category
stripe_platform_contributions           1477
stripe_platform_contrubution_refunds     426
stripe_platform_disputes                  37
Name: count, dtype: int64

In [58]:
# list all columns in all_platform_transactions_merged that end with _stripe, _paypal or _wise
stripe_duplicate_columns = [col for col in all_platform_transactions_merged.columns if col.endswith('_stripe')]
paypal_duplicate_columns = [col for col in all_platform_transactions_merged.columns if col.endswith('_paypal')]
wise_duplicate_columns = [col for col in all_platform_transactions_merged.columns if col.endswith('_wise')]

In [59]:
# drop all columns from stripe_duplicate_columns, paypal_duplicate_columns, wise_duplicate_columns from all_platform_transactions_merged
all_platform_transactions_merged = all_platform_transactions_merged.drop(columns=stripe_duplicate_columns + paypal_duplicate_columns + wise_duplicate_columns)

In [60]:
# append prefix platform. to all columns that do not start with stripe. or paypal. or wise. or .bank or _reconciliation_category
for col in all_platform_transactions_merged.columns:
    if not col.startswith(('stripe.', 'paypal.', 'wise.', 'bank', '_reconciliation_category')):
        all_platform_transactions_merged.rename(columns={col: 'platform.' + col}, inplace=True)

In [61]:
all_platform_transactions_merged = all_platform_transactions_merged[['_reconciliation_category'] + [col for col in all_platform_transactions_merged.columns if col != '_reconciliation_category']]

In [62]:
keep_columns = ['_reconciliation_category',
'platform.createdAt',
'platform.id',
'platform.merchantId',
'platform.group',
'platform.type',
'platform.kind',
'platform.description',
'platform.isRefunded',
'platform.isRefund',
'platform.isOrderRejected',
'platform.amount.currency',
'platform.amount.value',
'platform.amountInHostCurrency.currency',
'platform.amountInHostCurrency.value',
'platform.netAmount.currency',
'platform.netAmount.value',
'platform.netAmountInHostCurrency.currency',
'platform.netAmountInHostCurrency.value',
'platform.paymentProcessorFee.currency',
'platform.paymentProcessorFee.value',
'platform.account.id',
'platform.account.legacyId',
'platform.account.name',
'platform.account.slug',
'platform.account.type',
'platform.oppositeAccount.id',
'platform.oppositeAccount.legacyId',
'platform.oppositeAccount.name',
'platform.oppositeAccount.slug',
'platform.oppositeAccount.type',
'platform.order.id',
'platform.order.description',
'platform.order.memo',
'platform.paymentMethod.type',
'platform.order',
'platform.expense.id',
'platform.expense.type',
'platform.expense.description',
'platform.expense.invoiceInfo',
'platform.expense.tags',
'platform.expense.virtualCard.id',
'platform.expense.payee.id',
'platform.expense.payee.name',
'platform.expense.payee.slug',
'platform.expense.payee.type',
'platform.refundTransaction.id',
'platform.payoutMethod.type',
'stripe.balance_transaction_id',
'stripe.created_utc',
'stripe.created',
'stripe.available_on_utc',
'stripe.available_on',
'stripe.currency',
'stripe.gross',
'stripe.fee',
'stripe.net',
'stripe.reporting_category',
'stripe.source_id',
'stripe.description',
'stripe.customer_facing_amount',
'stripe.customer_facing_currency',
'stripe.automatic_payout_id',
'stripe.automatic_payout_effective_at_utc',
'stripe.automatic_payout_effective_at',
'stripe.customer_id',
'stripe.customer_description',
'stripe.charge_id',
'stripe.payment_intent_id',
'stripe.charge_created_utc',
'stripe.charge_created',
'stripe.payment_method_type',
'stripe.card_brand',
'stripe.card_funding',
'stripe.card_country',
'stripe.payment_metadata[from]',
'stripe.payment_metadata[to]',
'paypal.Date',
'paypal.Time',
'paypal.TimeZone',
'paypal.Name',
'paypal.Type',
'paypal.Status',
'paypal.Currency',
'paypal.Gross',
'paypal.Fee',
'paypal.Net',
'paypal.From Email Address',
'paypal.To Email Address',
'paypal.Transaction ID',
'paypal.Reference Txn ID',
'paypal.Balance',
'paypal.Contact Phone Number',
'paypal.Subject',
'paypal.Balance Impact',
'paypal.Datetime',
'wise.id',
'wise.status',
'wise.rate',
'wise.created',
'wise.details.reference',
'wise.sourceCurrency',
'wise.sourceValue',
'wise.targetCurrency',
'wise.targetValue']

In [63]:
# drop all columns from all_platform_transactions_merged that are not in keep_columns
all_platform_transactions_merged = all_platform_transactions_merged[keep_columns]

In [64]:
# values count of _reconciliation_category including NaN
print(all_platform_transactions_merged['_reconciliation_category'].value_counts(dropna=False))

_reconciliation_category
stripe_platform_contributions           525361
paypal_platform_contributions           291596
NaN                                      20965
wise_platform                             3403
stripe_platform_virtual_card              2180
paypal_platform_expenses                  1607
stripe_platform_contrubution_refunds       447
stripe_platform_disputes                   146
Name: count, dtype: int64


In [65]:
# getting 3 random samples for each category

samples = {}
categories = [category for category in all_platform_transactions_merged['_reconciliation_category'].dropna().unique()]
for category in categories:
    # get all unique platform.group values for the category
    groups = all_platform_transactions_merged[all_platform_transactions_merged['_reconciliation_category'] == category]['platform.group'].unique()
    # get 10 random values from groups list
    sample_groups = list(pd.Series(groups).sample(5))
    # add a new dataframe to samples dictionary with key as category
    samples[category] = all_platform_transactions_merged[all_platform_transactions_merged['platform.group'].isin(sample_groups)]

In [66]:
# join all in categories into a single dataframe
all_samples = pd.concat(samples.values())

In [67]:
# count number of unique values in the platform.group column
print(f'Unique groups in all samples: {all_samples["platform.group"].nunique()}')

Unique groups in all samples: 35


In [68]:
# get all unique values in the platform.refundTransaction.id column
refund_ids = all_samples['platform.refundTransaction.id'].dropna().unique()

# get all rows from all_platform_transactions_merged where platform.id is in refund_ids
refund_transactions = all_platform_transactions_merged[all_platform_transactions_merged['platform.id'].isin(refund_ids)]

# get all groups from refund_transactions   
refund_groups = refund_transactions['platform.group'].unique()

# get all rows from all_platform_transactions_merged where platform.group is in refund_groups
original_transactions = all_platform_transactions_merged[all_platform_transactions_merged['platform.group'].isin(refund_groups)]

In [69]:
# concatenate original_transactions and all_samples into a single dataframe and drop duplicates
all_transactions = pd.concat([original_transactions, all_samples]).drop_duplicates()

# drop all rows that are duplicates in all_transactions on all columns except for _reconciliation_category
all_transactions = all_transactions.drop_duplicates(subset=all_transactions.columns.difference(['_reconciliation_category']))

# count number of unique values in the platform.group column
print(f'Unique groups in all transactions: {all_transactions["platform.group"].nunique()}')

Unique groups in all transactions: 42


In [71]:
# print all rows in all_samples
pd.set_option('display.max_rows', None)
# sort by platform.group
final_samples = all_transactions.sort_values('platform.group')
# pd show all columns
pd.set_option('display.max_columns', None)