# Clean "memo" Column
This notebook:
1) Cleans the `memo` field of each outflow transaction
2) Analyzes the memo fields before and after cleaning

## Requirements

In [1]:
import os

import pandas as pd
pd.set_option('display.float_format', '{:.4f}'.format)
# from sklearn.model_selection import train_test_split

## Read Data

In [2]:
# Load the outflow transactions from the parquet file into a DataFrame
outflow_df = pd.read_parquet('ucsd-outflows.pqt')

# Show a titled preview of the first rows so the column layout is visible
outflow_preview = outflow_df.head()
print("\nOutflow Data")
display(outflow_preview)


Outflow Data


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.6,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.0,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.0,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES


In [3]:
# Collect the distinct values of the "memo" column; these guide the cleaning rules
memo_column = outflow_df["memo"]
memo_fields = memo_column.unique()

# Peek at the first 30 distinct memos to see what the raw text looks like
sample_memo_fields = memo_fields[:30]
sample_memo_fields

array(['LOAN', 'ATM_CASH', 'TST* Casa Del Rio - Exp Fairlawn OH 09/24',
       'Buffalo Wild Wings', 'ENTERTAINMENT', 'Oculus CA 04/16',
       'LOS GIRASOLES STOW OH 03/08', 'BUZZIS LAUNDRY 1 OH 03/28',
       'BUZZIS LAUNDRY 1 OH 02/13', 'TGI FRIDAYS XXXX STOW OH 12/31',
       'TST* The Basement Sp Cuyahoga Fall OH 06/06', "Lowe's",
       'PIADA - 39 OH 08/23', 'GrubHub', 'ESSENTIAL_SERVICES',
       'HARDEES XXXXXXX AKRON OH 05/29',
       'MARKET DI XXXX State Cuyahoga Fall OH 04/06',
       'SWENSONS - MONTROSE AKRON OH 06/29', 'Great Clips',
       'EXTERNAL_TRANSFER', 'APPLE.COM/BILL CA 04/07', 'AUTOMOTIVE',
       'APPLE.COM/BILL CA 04/29', 'APPLE.COM/BILL CA 01/28',
       'UNCATEGORIZED', 'LOS GIRASOLES STOW OH 05/24',
       'WING WAREHOUSE CUYAH CUYAHOGA FALL OH 03/29',
       'WINKING LIZARD - 30 MA OH 05/23',
       'LONGHORN STEAK XXXXX CUYAHOGA FALL OH 10/09',
       'ON TAP - CUYAHOGA FA CUYAHOGA FALL OH 12/06'], dtype=object)

In [4]:
# Save the unique memo fields to a text file, one memo per line.
# The output folder is created on demand so this cell also works on a fresh
# checkout where 'memos/' does not exist yet.
os.makedirs('memos', exist_ok=True)

# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, which may be unable to encode some memo text.
with open('memos/unique_memo.txt', 'w', encoding='utf-8') as file:
    for memo in memo_fields:
        file.write(memo + '\n')

# Print the number of unique memo fields
print(f"Total unique memo fields: {len(memo_fields)}")
print("Memo fields have been saved to 'unique_memo.txt'")


Total unique memo fields: 528786
Memo fields have been saved to 'unique_memo.txt'


## Clean memo

In [5]:
import re

# Patterns used by clean_memo, compiled once and listed in the order they are applied.
_REDACTION_RE = re.compile(r'xxx+')          # runs of three or more 'x' (redaction masks)
_DATE_RE = re.compile(r'\d{1,2}/\d{1,2}')    # short dates such as '09/24'
_NON_LETTER_RE = re.compile(r'[^a-z\s]')     # anything except lowercase a-z and whitespace
_WHITESPACE_RE = re.compile(r'\s+')          # runs of whitespace


def clean_memo(memo):
    """Normalize a raw transaction memo into lowercase, letters-only text.

    Steps, in order:
      1. lowercase the text;
      2. collapse any run of three or more 'x' (e.g. 'xxxx') to 'xxx';
      3. replace short dates such as '09/24' with a space;
      4. replace every character that is not a-z or whitespace with a space
         (this includes digits, punctuation and non-ASCII letters);
      5. collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    memo : str or missing value
        The raw memo. Missing values (None, NaN, pd.NA) are accepted;
        any other non-string value is converted with str() first.

    Returns
    -------
    str
        The cleaned memo; an empty string for missing input.
    """
    if not isinstance(memo, str):
        # A missing memo has no text to clean; returning '' avoids an
        # AttributeError from calling .lower() on None/NaN.
        if memo is None or pd.isna(memo):
            return ''
        memo = str(memo)

    # Convert to lowercase
    memo = memo.lower()

    # Replace redacted information (e.g., 'xxxx', 'xxxxxxxxxxxxxxx') with 'xxx'
    memo = _REDACTION_RE.sub('xxx', memo)

    # Replace dates (e.g., '09/24') with a space. The next step would also
    # blank out these characters; this step is kept to state the intent.
    memo = _DATE_RE.sub(' ', memo)

    # Replace digits and special characters with a space
    memo = _NON_LETTER_RE.sub(' ', memo)

    # Replace multiple spaces with a single space
    memo = _WHITESPACE_RE.sub(' ', memo).strip()

    return memo

# Run every raw memo through clean_memo and keep the result as a new column
outflow_df['cleaned_memo'] = outflow_df['memo'].apply(clean_memo)

# Persist the cleaned memos, one per line, with no index and no header row
cleaned_memos = outflow_df['cleaned_memo']
cleaned_memos.to_csv('memos/cleaned_memo.txt', header=False, index=False)

# Report how many rows were processed
row_count = len(outflow_df)
print(f"Cleaned {row_count} memo fields and saved to 'cleaned_memo.txt'.")

Cleaned 2597488 memo fields and saved to 'cleaned_memo.txt'.


## Comparison Table

In [6]:
# Pair each original memo with its cleaned version, row for row
comparison_df = pd.DataFrame({
    'Original Memo': outflow_df['memo'],
    'Cleaned Memo': outflow_df['cleaned_memo']
})

# Render the first ten rows as a rich table, matching the display() call
# used for the data preview earlier in the notebook
display(comparison_df.head(10))

# Save the full comparison to a CSV file in the working directory
comparison_df.to_csv('memo_comparison.csv', index=False)

                               Original Memo                      Cleaned Memo
0                                       LOAN                              loan
1                                   ATM_CASH                          atm cash
2  TST* Casa Del Rio - Exp Fairlawn OH 09/24  tst casa del rio exp fairlawn oh
3                                       LOAN                              loan
4                         Buffalo Wild Wings                buffalo wild wings
5                              ENTERTAINMENT                     entertainment
6                            Oculus CA 04/16                         oculus ca
7                LOS GIRASOLES STOW OH 03/08             los girasoles stow oh
8                  BUZZIS LAUNDRY 1 OH 03/28                 buzzis laundry oh
9                  BUZZIS LAUNDRY 1 OH 02/13                 buzzis laundry oh
