In [1]:
import pandas as pd
import numpy as np
import re
from datasets import Dataset
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [2]:
# Load the raw transaction tables from parquet files (paths are relative to
# the notebook's working directory).
inflow = pd.read_parquet("ucsd-inflows.pqt")
outflow = pd.read_parquet("ucsd-outflows.pqt")

In [3]:
# Draw a fixed-size, seeded random sample from each table so the rest of the
# notebook runs on a smaller, reproducible subset.
inflow_subset = inflow.sample(n=12500, random_state=42)
outflow_subset = outflow.sample(n=12500, random_state=42)

# NOTE: from here on `inflow` / `outflow` refer to the samples, not the full tables.
inflow, outflow = inflow_subset, outflow_subset

In [4]:
# Drop rows whose memo is identical to its category label.
# NOTE(review): presumably those memos were already replaced by the label
# upstream and carry no free text -- confirm against the data source.
# `.copy()` makes the filtered frame independent of `outflow`, so the column
# assignment below writes to a real frame instead of a slice
# (avoids SettingWithCopyWarning).
outflow_cleaned = outflow[outflow['memo'] != outflow['category']].copy()

# Normalise the memo text with vectorized string methods, in this order:
#   1. lower-case everything
#   2. replace every character that is not a-z or whitespace with a space
#   3. replace placeholder runs of three or more 'x' with a space
#   4. collapse whitespace runs to a single space and trim both ends
# Unlike a row-wise `apply` with `str` methods, the `.str` accessor leaves
# missing memos as NaN instead of raising.
outflow_cleaned['memo'] = (
    outflow_cleaned['memo']
    .str.lower()
    .str.replace(r'[^a-z\s]', ' ', regex=True)
    .str.replace(r'xxx+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Separate working copy used for label encoding and the train/test split.
outflow_data = outflow_cleaned.copy()

In [5]:
# Give every distinct category a consecutive integer id, numbered in the order
# the categories are returned by `unique`.
category_mapping = {
    name: label_id
    for label_id, name in enumerate(pd.unique(outflow_data['category']))
}

# Encode the category column into the integer `labels` column.
outflow_data['labels'] = outflow_data['category'].map(category_mapping)

In [6]:
# Split on customer ids rather than on rows, so all transactions of a given
# customer land on the same side of the split.
unique_customer_ids = outflow_data['prism_consumer_id'].unique()
train_ids, test_ids = train_test_split(unique_customer_ids, test_size=0.2, random_state=42)

# Membership tests against a set are O(1); `x in <array>` would scan the whole
# id array once for every row of the dataset.
train_id_set = set(train_ids)
test_id_set = set(test_ids)

# Convert the DataFrame to a Hugging Face Dataset after splitting the ids
dataset = Dataset.from_pandas(outflow_data)

# Keep, for each split, only the rows whose customer id belongs to that split
train_data = dataset.filter(lambda x: x['prism_consumer_id'] in train_id_set)
test_data = dataset.filter(lambda x: x['prism_consumer_id'] in test_id_set)

Filter:   0%|          | 0/6292 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6292 [00:00<?, ? examples/s]

In [7]:
# Tokenizer matching the pretrained 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_data(batch):
    """Tokenize the ``memo`` texts of a batch and carry its labels through.

    Args:
        batch: mapping with a ``memo`` entry (texts to tokenize) and a
            ``labels`` entry, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (called with ``padding='max_length'`` and
        ``truncation=True``) with the batch's ``labels`` attached.
    """
    encoded = tokenizer(batch['memo'], truncation=True, padding='max_length')
    encoded["labels"] = batch["labels"]
    return encoded


# Tokenize both splits in batched mode (train first, then test)
train_data, test_data = (
    split.map(tokenize_data, batched=True) for split in (train_data, test_data)
)


Map:   0%|          | 0/5027 [00:00<?, ? examples/s]

Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

In [8]:
# Rename the raw memo column to "text" on both splits
train_data, test_data = (
    split.rename_column("memo", "text") for split in (train_data, test_data)
)

# Show the column names of each split, one list per line
print(train_data.column_names, test_data.column_names, sep="\n")

['prism_consumer_id', 'prism_account_id', 'text', 'amount', 'posted_date', 'category', 'labels', '__index_level_0__', 'input_ids', 'attention_mask']
['prism_consumer_id', 'prism_account_id', 'text', 'amount', 'posted_date', 'category', 'labels', '__index_level_0__', 'input_ids', 'attention_mask']


In [9]:
# The classification head needs one output per distinct category
num_labels = len(pd.unique(outflow_data['category']))

# Pretrained DistilBERT encoder with a sequence-classification head on top
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: 3 epochs, batch size 16 per device for both
# training and evaluation, weight decay 0.01, and no evaluation during training.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="no",
)

# Wire the model, the configuration and both dataset splits into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Run fine-tuning; the returned training summary is displayed as the cell output
trainer.train()



  0%|          | 0/945 [00:00<?, ?it/s]

{'loss': 0.5077, 'grad_norm': 3.6194894313812256, 'learning_rate': 2.3544973544973546e-05, 'epoch': 1.59}
{'train_runtime': 818.6952, 'train_samples_per_second': 18.421, 'train_steps_per_second': 1.154, 'train_loss': 0.34212417198867395, 'epoch': 3.0}


TrainOutput(global_step=945, training_loss=0.34212417198867395, metrics={'train_runtime': 818.6952, 'train_samples_per_second': 18.421, 'train_steps_per_second': 1.154, 'total_flos': 1997990227233792.0, 'train_loss': 0.34212417198867395, 'epoch': 3.0})

In [11]:
# Evaluate the trained model on the Trainer's eval dataset; the returned
# metrics dict is displayed as the cell output.
trainer.evaluate()

  0%|          | 0/80 [00:00<?, ?it/s]

{'eval_loss': 0.4670441448688507,
 'eval_runtime': 24.3881,
 'eval_samples_per_second': 51.869,
 'eval_steps_per_second': 3.28,
 'epoch': 3.0}

In [12]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }


# Attach the metric function to the existing trainer, then evaluate again and
# keep the results for later use.
trainer.compute_metrics = compute_metrics
eval_results = trainer.evaluate()

  0%|          | 0/80 [00:00<?, ?it/s]

In [13]:
# Write the evaluation results to a plain-text report: a two-line header
# followed by one "key: value" line per metric.
output_file = "evaluation_metrics.txt"
with open(output_file, "w") as file:
    file.write(
        "Evaluation Metrics on Test Set\n"
        "=============================\n"
    )
    file.writelines(f"{key}: {value}\n" for key, value in eval_results.items())

Evaluation Metrics on Test Set
=============================
eval_loss: 0.4670441448688507  
eval_accuracy: 0.8956521739130435  
eval_f1: 0.8930265530973801  
eval_runtime: 23.137  
eval_samples_per_second: 54.674  
eval_steps_per_second: 3.458  
epoch: 3.0  


In [14]:
# Reverse category_mapping to create index_to_category
index_to_category = {v: k for k, v in category_mapping.items()}

In [21]:
# Show the cleaned memo text of the first row
outflow_cleaned['memo'].iloc[0]

'purchase authorized on fordham deli bronx ny s card'

In [22]:
import torch

# Use the first cleaned memo as the sample input.
# NOTE(review): this row comes from the same frame the training split was
# drawn from, so it may not be an unseen example.
new_memo = outflow_cleaned.iloc[0]['memo']

# Pick the best available device instead of hard-coding 'mps', so the cell
# also runs on CUDA machines and on CPU-only machines.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model.to(device)
# Inference mode: disables dropout so the prediction is deterministic even if
# this cell is run right after training.
model.eval()

# Tokenize the memo and move the input tensors to the same device as the model
inputs = tokenizer(new_memo, return_tensors="pt", padding=True, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Forward pass without gradient tracking; the highest logit is the predicted label id
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(-1)
    predicted_category = predictions.item()

# Translate the integer label id back to its category name
print("Predicted Category:", index_to_category[predicted_category])


Predicted Category: FOOD_AND_BEVERAGES


In [17]:
# Check whether a CUDA device is available to PyTorch in this environment
torch.cuda.is_available()

False

In [18]:
# Print the category name -> integer label id mapping for reference
print(category_mapping)


{'FOOD_AND_BEVERAGES': 0, 'GROCERIES': 1, 'GENERAL_MERCHANDISE': 2, 'EDUCATION': 3, 'TRAVEL': 4, 'OVERDRAFT': 5, 'PETS': 6, 'RENT': 7, 'MORTGAGE': 8}
