In [55]:
import pandas as pd
import numpy as np
import re
from datasets import Dataset
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import fasttext
import xgboost as xgb
from catboost import CatBoostClassifier


In [33]:
# Read datasets
outflow = pd.read_parquet("ucsd-outflows.pqt")
inflow = pd.read_parquet("ucsd-inflows.pqt")

In [34]:
# Create a subset for testing
#inflow_subset = inflow.sample(n=125000, random_state=42)
#outflow_subset = outflow.sample(n=125000, random_state=42)
inflow
outflow

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.60,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.00,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.00,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...
2597483,5941,acc_9524,ATM_CASH,8.42,2023-01-25,ATM_CASH
2597484,5941,acc_9524,ATM_CASH,2.06,2023-01-25,ATM_CASH
2597485,5941,acc_9524,ATM_CASH,262.88,2023-01-25,ATM_CASH
2597486,5941,acc_9524,ATM_CASH,10.00,2023-01-25,ATM_CASH


In [35]:
# Filter out rows with 'memo' uncleaned
outflow_cleaned = outflow[outflow['memo'] != outflow['category']]

# Lower case all values in memo
outflow_cleaned.loc[:, 'memo'] = outflow_cleaned['memo'].apply(lambda x: x.lower())

# Remove special characters and numbers
outflow_cleaned.loc[:, 'memo'] = outflow_cleaned['memo'].apply(lambda x: re.sub(r'[^a-z\s]', ' ', x))

# Remove placeholders
outflow_cleaned.loc[:, 'memo'] = outflow_cleaned['memo'].apply(lambda x: re.sub(r'xxx+', ' ', x))

# Remove extra spaces
outflow_cleaned.loc[:, 'memo'] = outflow_cleaned['memo'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

outflow_data = outflow_cleaned.copy()

# FastText Model

In [36]:
customer_id = outflow_data['prism_consumer_id'].unique()

train_id, test_id = train_test_split(customer_id, test_size= 0.25, random_state = 42)

train_data = outflow_data[outflow_data['prism_consumer_id'].isin(train_id)]
test_data = outflow_data[outflow_data['prism_consumer_id'].isin(test_id)]

In [37]:
outflow_data

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
2,0,acc_0,tst casa del rio exp fairlawn oh,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,oculus ca,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,los girasoles stow oh,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,buzzis laundry oh,4.16,2022-03-29,GENERAL_MERCHANDISE
...,...,...,...,...,...,...
2597457,5941,acc_9524,debit card withdrawal purchaseamazon prime ti ...,15.93,2023-01-16,GENERAL_MERCHANDISE
2597462,5941,acc_9524,pos withdrawalaz lot quiktrip e indian school ...,25.00,2023-01-18,EDUCATION
2597465,5941,acc_9524,pos withdrawalwal mart e mckellips rd mesa az ...,3.68,2023-01-18,FOOD_AND_BEVERAGES
2597468,5941,acc_9524,withdrawal salt river projetype online pmt co ...,90.00,2023-01-20,FOOD_AND_BEVERAGES


In [None]:

# Create a new column for the training data in the format of FastText
df = train_data.copy()

df['fasttext_format'] = df['category'].apply(lambda x: f"__label__{x}") + ' ' + df['memo'].apply(str)

# Save to a text file
df[['fasttext_format']].to_csv('train_data.txt', header=False, index=False, sep='\n')

In [None]:
# Create a new column for the training data in the format of FastText
df = test_data.copy()

df['fasttext_format'] = df['category'].apply(lambda x: f"__label__{x}") + ' ' + df['memo'].apply(str)

# Save to a text file
df[['fasttext_format']].to_csv('test_data.txt', header=False, index=False, sep='\n')

In [47]:
# Train the FastText model using the prepared training data
model = fasttext.train_supervised(input="train_data.txt", epoch=30, lr=0.1, wordNgrams=2)

# Save the trained model for later use
model.save_model("transaction_category_model.bin")

Read 7M words
Number of words:  68092
Number of labels: 9
Progress: 100.0% words/sec/thread:  970974 lr:  0.000000 avg.loss:  0.010750 ETA:   0h 0m 0s


In [None]:
# Test the model on test set
test_model = fasttext.load_model("transaction_category_model.bin")

result = test_model.test("test_data.txt")

print(f"Test Accuracy: {result[1]}")

Test Accuracy: 0.9690070252326403


In [76]:
# Test the model on train set(check for signs of overfitting)
test_model = fasttext.load_model("transaction_category_model.bin")

result = test_model.test("train_data.txt")

print(f"Train Accuracy: {result[1]}")

Train Accuracy: 0.9993843240868505


In [49]:
# Predict the category for a new transaction memo
text = "pos withdrawal walmart"
prediction = model.predict(text)

# The prediction returns a tuple (labels, probabilities)
print(f"Predicted Category: {prediction[0][0]} with probability {prediction[1][0]}")


Predicted Category: __label__GROCERIES with probability 0.9982520341873169


In [83]:
predictions = []

for x in y_test:
    predictions.append(model.predict(x))

predictions

[(('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__FOOD_AND_BEVERAGES',), array([1.00001001])),
 (('__label__F

In [82]:
y_test

1894201    GENERAL_MERCHANDISE
1588081     FOOD_AND_BEVERAGES
48158       FOOD_AND_BEVERAGES
2254084     FOOD_AND_BEVERAGES
1252749     FOOD_AND_BEVERAGES
                  ...         
1010780              GROCERIES
799538     GENERAL_MERCHANDISE
1580527    GENERAL_MERCHANDISE
798908      FOOD_AND_BEVERAGES
1656190     FOOD_AND_BEVERAGES
Name: category, Length: 326613, dtype: object

In [81]:
len(predictions)

1306452

# XGBoost

In [62]:
X = outflow_data['memo']
y = outflow_data['category']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=4000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Encode transaction labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [69]:
X_train

543261                  checkcard universal orlando websi fl
337469                   checkcard litaf itt lit rock afb ar
792030                                         feednix tacos
53710                                      apple com bill ca
1838856                         checkcard wal mart polson mt
                                 ...                        
239698                              citizens cafe westwood m
564028     purchase authorized on sunset service llc seat...
289812                                                target
1389407                                       apple com bill
268584                      pos withdrawal dollarshave dolla
Name: memo, Length: 979839, dtype: object

In [67]:
X_test

1894201                     idl wheel tenant llc orlan do fl
1588081                 checkcard lifevantage corporation ut
48158                                             mcdonald s
2254084                                moe s jacksonville fl
1252749                                                sonic
                                 ...                        
1010780                                               publix
799538                                           great clips
1580527    purchase authorized on family dollar atlanta g...
798908                                           chick fil a
1656190    pos debit visa check card mcdonald s mary e mu...
Name: memo, Length: 326613, dtype: object

In [73]:
# Train XGBoost model
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss'
)
xgb_model.fit(X_train_tfidf, y_train_encoded)

# Predict and evaluate the model
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# Inverse transform to get the original labels
y_pred_xgb_decoded = label_encoder.inverse_transform(y_pred_xgb)

# Print the accuracy
print(f"XGBoost Accuracy: {accuracy_score(y_test_encoded, y_pred_xgb)}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9198990854619994


# CatBoost

In [74]:
# Train CatBoost model
catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, cat_features=[])
catboost_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_catboost = catboost_model.predict(X_test_tfidf)
print(f"CatBoost Accuracy: {accuracy_score(y_test, y_pred_catboost)}")

0:	learn: 1.8869541	total: 849ms	remaining: 14m 8s
1:	learn: 1.7096344	total: 1.66s	remaining: 13m 49s
2:	learn: 1.5727430	total: 2.42s	remaining: 13m 25s
3:	learn: 1.4647761	total: 3.15s	remaining: 13m 5s
4:	learn: 1.3845290	total: 3.9s	remaining: 12m 56s
5:	learn: 1.3145859	total: 4.63s	remaining: 12m 46s
6:	learn: 1.2523914	total: 5.35s	remaining: 12m 39s
7:	learn: 1.2012453	total: 6.07s	remaining: 12m 32s
8:	learn: 1.1557617	total: 6.78s	remaining: 12m 26s
9:	learn: 1.1140061	total: 7.47s	remaining: 12m 19s
10:	learn: 1.0744085	total: 8.15s	remaining: 12m 13s
11:	learn: 1.0449546	total: 8.89s	remaining: 12m 12s
12:	learn: 1.0182186	total: 9.61s	remaining: 12m 9s
13:	learn: 0.9948220	total: 10.3s	remaining: 12m 6s
14:	learn: 0.9739941	total: 11s	remaining: 12m 2s
15:	learn: 0.9550544	total: 11.7s	remaining: 11m 58s
16:	learn: 0.9371684	total: 12.4s	remaining: 11m 55s
17:	learn: 0.9209747	total: 13s	remaining: 11m 50s
18:	learn: 0.9070615	total: 13.7s	remaining: 11m 48s
19:	learn: 0.