In [13]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Read the transactions CSV; the path is relative to the current working directory.
file_path = '../data/dataset_with_subcategories.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Plain-text preview of the first five rows.
print(df.head(n=5))

                                              amount  \
0  The amount involved was 2500 dollars with my f...   
1  An amount of 1500 dollars was transferred to A...   
2                   I paid 2500 dollars to a company   
3  The amount involved was 2000 dollars with my c...   
4  An amount of 100 dollars was transferred to AC...   

                                involvement  \
0                    I dealt with a company   
1            my colleague was the recipient   
2   a store was involved in the transaction   
3  a vendor was involved in the transaction   
4         I conducted business with a store   

                            payment_method  \
0                        I used google pay   
1                    I paid through Paypal   
2  I used merchant one for the transaction   
3                    I paid using cash app   
4          I transferred money using Venmo   

                                   transaction_type                   category  
0                     

In [6]:
# Encode the target column 'category'
# Fit the encoder on the category values first, then store the integer codes
# it assigns in a new 'category_encoded' column.
label_encoder = LabelEncoder().fit(df['category'])
df['category_encoded'] = label_encoder.transform(df['category'])

In [7]:
# Display the integer-encoded target column produced by the label encoder.
df.loc[:, 'category_encoded']

0        12
1        22
2        24
3        24
4        24
         ..
68792    22
68793     9
68794    24
68795    14
68796     7
Name: category_encoded, Length: 68797, dtype: int32

In [8]:
# Tokenizer
# Load the pretrained 'bert-base-uncased' BertTokenizer; a later cell passes it
# to TransactionDataset, which uses it to encode each row's text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
# Dataset class to handle text inputs for all columns
class TransactionDataset(Dataset):
    """Map-style dataset that converts one dataframe row into a tokenized example.

    Each item joins the row's 'amount', 'involvement', 'payment_method' and
    'transaction_type' values into a single labelled sentence, tokenizes it
    with ``tokenizer.encode_plus`` and pairs it with the row's
    'category_encoded' value as the label.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the source frame, the tokenizer and the fixed sequence length."""
        self.dataframe, self.tokenizer, self.max_len = dataframe, tokenizer, max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return self.dataframe.shape[0]

    def __getitem__(self, index):
        """Build the model inputs for the row at positional ``index``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer's tensors,
            flattened to one dimension) and 'labels' (a ``torch.long`` tensor
            holding the row's 'category_encoded' value).
        """
        # Look the row up once and reuse it for every field.
        row = self.dataframe.iloc[index]

        # Every feature column is treated as free text.
        amount, involvement, payment_method, transaction_type = (
            str(row[column])
            for column in ('amount', 'involvement', 'payment_method', 'transaction_type')
        )

        # One sentence-like string carrying all four fields.
        text = (
            f"Amount: {amount}. "
            f"Involvement: {involvement}. "
            f"Payment Method: {payment_method}. "
            f"Transaction Type: {transaction_type}"
        )

        # Pad/truncate to max_len and ask for PyTorch tensors.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer.encode_plus(text, **tokenizer_options)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(row['category_encoded'], dtype=torch.long),
        }

In [10]:
# Parameters
MAX_LEN: int = 128    # max_len handed to TransactionDataset (tokenizer max_length)
BATCH_SIZE: int = 16  # batch_size used for both DataLoaders

In [11]:
# Split the data
# 80/20 train/validation split with a fixed random_state for a repeatable partition.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Create datasets
# Both splits share the same tokenizer and sequence length.
train_dataset, val_dataset = (
    TransactionDataset(split_frame, tokenizer, MAX_LEN)
    for split_frame in (train_df, val_df)
)

# DataLoader
# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
# Load the BERT model
# The classification head gets one output per distinct encoded category.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=df['category_encoded'].nunique(),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Move model to device (GPU or CPU)
# Prefer CUDA when PyTorch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

In [16]:
# Optimizer
# Use PyTorch's own AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are passed explicitly with
# the values the transformers class used by default (1e-6 and 0.0), because the
# PyTorch defaults differ (1e-8 and 0.01) and would otherwise change training.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [17]:
# Training function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and report its metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64 tensor
        (correct predictions divided by the number of examples processed) and
        ``mean_loss`` is the unweighted mean of the per-batch losses as a float.

    Raises:
        ValueError: If ``data_loader`` yields no examples.
    """
    model = model.train()
    losses = []
    # Start from a tensor so the final ``.double()`` is valid on every path.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    examples_seen = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients BEFORE the backward pass so anything left in .grad
        # (e.g. from an interrupted earlier run) cannot leak into this update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)

        correct_predictions += torch.sum(preds == labels)
        # Count what was actually processed; len(dataset) is wrong whenever the
        # loader drops or subsamples examples.
        examples_seen += labels.numel()
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    if examples_seen == 0:
        raise ValueError('data_loader yielded no examples; cannot compute training metrics')

    return correct_predictions.double() / examples_seen, sum(losses) / len(losses)


In [18]:
# Validation function
def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
            It is switched to eval mode and left in that mode.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64 tensor
        (correct predictions divided by the number of examples processed) and
        ``mean_loss`` is the unweighted mean of the per-batch losses as a float.

    Raises:
        ValueError: If ``data_loader`` yields no examples.
    """
    model = model.eval()
    losses = []
    # Start from a tensor so the final ``.double()`` is valid on every path.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    examples_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            correct_predictions += torch.sum(preds == labels)
            # Count what was actually processed; len(dataset) is wrong whenever
            # the loader drops or subsamples examples.
            examples_seen += labels.numel()
            losses.append(loss.item())

    if examples_seen == 0:
        raise ValueError('data_loader yielded no examples; cannot compute validation metrics')

    return correct_predictions.double() / examples_seen, sum(losses) / len(losses)

In [None]:
# Training loop
EPOCHS = 3
# Filled in by the last epoch's validation pass so it can be reused afterwards.
val_acc = val_loss = None
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device)
    # float() prints the plain number instead of the 0-dim tensor repr.
    print(f'Train loss {train_loss}, accuracy {float(train_acc)}')

    val_acc, val_loss = eval_model(model, val_loader, device)
    print(f'Validation loss {val_loss}, accuracy {float(val_acc)}')

# Model evaluation on validation data
# The model has not changed since the last epoch's validation pass, so reuse
# those metrics instead of running a second full pass over val_loader.
# Evaluate here only if the loop never ran (EPOCHS == 0).
if val_acc is None:
    val_acc, val_loss = eval_model(model, val_loader, device)
print(f'Validation accuracy: {float(val_acc)}, loss: {val_loss}')

In [None]:
import nbformat
from nbconvert import PythonExporter

# Load the notebook file
# Parsed as notebook format version 4.
with open("BERT.ipynb", mode="r", encoding="utf-8") as notebook_file:
    notebook_content = nbformat.read(notebook_file, as_version=4)

# Convert to Python script
# from_notebook_node returns a pair; only its first element (the source text) is used.
python_exporter = PythonExporter()
python_code = python_exporter.from_notebook_node(notebook_content)[0]

# Write the Python script to a .py file
with open("BERT.py", mode="w", encoding="utf-8") as script_file:
    script_file.write(python_code)