## Data Exploration and Preprocessing

In [47]:
import re
import unicodedata
from pathlib import Path

import pandas as pd

In [48]:
# Paths: the data folder sits in the parent of the current working directory.
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR.joinpath("data")

# Load the raw transactions table.
df = pd.read_parquet(DATA_DIR.joinpath("transactions.parquet"))

In [49]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,transaction_description,category,country,currency
0,Wage,Income,USA,USD
1,Arby's (Contactless),Food & Dining,AUSTRALIA,AUD
2,Occupational Therapy,Healthcare & Medical,USA,USD
3,Potbelly Store Branch,Food & Dining,UK,GBP
4,Amazon - AUSTRALIA,Shopping & Retail,AUSTRALIA,AUD


In [50]:
print(df.shape)         # (n_rows, n_cols)
print(df.dtypes)        # dtype of each column
print(df.isna().sum())  # missing-value count per column

(4501043, 4)
transaction_description    object
category                   object
country                    object
currency                   object
dtype: object
transaction_description    0
category                   0
country                    0
currency                   0
dtype: int64


In [51]:
# Number of rows per category label.
df.loc[:, 'category'].value_counts()

category
Utilities & Services          451842
Government & Legal            451108
Financial Services            450959
Income                        450545
Charity & Donations           450133
Shopping & Retail             449941
Healthcare & Medical          449857
Entertainment & Recreation    449495
Transportation                449235
Food & Dining                 447928
Name: count, dtype: int64

In [52]:
# Only 'transaction_description' and 'category' are kept for categorization;
# .copy() detaches the subset from `df`.
df2 = df.loc[:, ['transaction_description', 'category']].copy()
df2.head(n=5)

Unnamed: 0,transaction_description,category
0,Wage,Income
1,Arby's (Contactless),Food & Dining
2,Occupational Therapy,Healthcare & Medical
3,Potbelly Store Branch,Food & Dining
4,Amazon - AUSTRALIA,Shopping & Retail


In [None]:
# Compiled once at definition time; the function is applied element-wise to a column.
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_description(d) -> str:
    """Normalize a raw transaction description to lowercase alphabetic words.

    Steps:
      1. Unicode NFKD-normalize, so accented and compatibility letters decompose
         into a base letter plus marks ("Café" -> "cafe" instead of "caf").
      2. Lowercase.
      3. Delete every character that is not a-z or whitespace (digits,
         punctuation, and the combining marks produced by step 1).
      4. Collapse whitespace runs to a single space and trim both ends.

    Args:
        d: The raw description. Anything that is not a ``str`` (e.g. ``None``
           or a float NaN standing in for a missing value) is treated as
           having no text.

    Returns:
        The cleaned description; an empty string when nothing alphabetic
        remains or when ``d`` is not a string.
    """
    if not isinstance(d, str):
        # Missing/non-text values carry no letters; avoid AttributeError on .lower().
        return ""
    d = unicodedata.normalize("NFKD", d).lower()
    d = _NON_ALPHA_RE.sub('', d)  # characters are removed, not replaced by a space
    d = _WHITESPACE_RE.sub(' ', d).strip()
    return d

In [54]:
# Apply the cleaning function.
# The frame is derived from `df` in a single chain (no in-place drop), so this
# cell can be re-run any number of times and always yields the same `df2`.
df2 = (
    df[['transaction_description', 'category']]
    .assign(description=lambda frame: frame['transaction_description'].map(clean_description))
    .loc[:, ['description', 'category']]  # cleaned text first, label last
)

df2.head()

Unnamed: 0,description,category
0,wage,Income
1,arbys contactless,Food & Dining
2,occupational therapy,Healthcare & Medical
3,potbelly store branch,Food & Dining
4,amazon australia,Shopping & Retail


In [55]:
# Persist the cleaned frame; the index is not written to the file.
df2.to_parquet(path=DATA_DIR.joinpath("preprocessed_transactions.parquet"), index=False)

In [57]:
# Non-null count per column for rows whose category is anything other than 'Income'.
df2.loc[df2['category'].ne('Income')].count()

description    4050498
category       4050498
dtype: int64

In [58]:
# Non-null count per column over the whole cleaned frame.
df2.count(axis=0)

description    4501043
category       4501043
dtype: int64