In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fixed-transactions/amex_all_categorized_raw.csv
/kaggle/input/synthetic-transactions/Synthetic_transactions_data.csv
/kaggle/input/preclassified/Amex Categorized Raw.csv
/kaggle/input/transactions/amex_all_categorized_raw.csv
/kaggle/input/no-negative-transactions/amex_all_categorized_raw.csv


## BERT BASE

In [5]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import pickle

# Load the dataset
df = pd.read_csv('/kaggle/input/no-negative-transactions/amex_all_categorized_raw.csv')
df = df[df['Description'] != 'Description'] 

# Encode categories
label_encoder = LabelEncoder()
df['Category_encoded'] = label_encoder.fit_transform(df['Category'])

# Save the encoder
with open('/kaggle/working/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
input_ids = []
attention_masks = []

for desc in df['Description']:
    encoded_dict = tokenizer.encode_plus(
        desc,                              # Sentence to encode
        add_special_tokens = True,         # Add '[CLS]' and '[SEP]'
        max_length = 64,                   # Pad & truncate all sentences
        pad_to_max_length = True,
        return_attention_mask = True,      # Construct attention masks
        return_tensors = 'pt',             # Return pytorch tensors
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(df['Category_encoded'].values)

# Splitting the dataset
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.1)

# Convert to DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=32)


# Define the custom model
class BertForSequenceClassificationCustom(nn.Module):
    def __init__(self, num_labels):
        super(BertForSequenceClassificationCustom, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Initialize the model
num_labels = len(df['Category'].unique())
model = BertForSequenceClassificationCustom(num_labels)

# Define the evaluation function
def evaluate_model(model, dataloader, device):
    model.eval()
    total_eval_accuracy = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()

        total_eval_accuracy += np.sum(predictions == labels_flat) / len(labels_flat)
    
    return total_eval_accuracy / len(dataloader)

# Before fine-tuning, evaluate the model to get the initial accuracy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

initial_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Initial Validation Accuracy: {initial_accuracy:.4f}')

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 25  # Adjust the number of epochs here
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch_i in range(0, epochs):  # Loop through 15 epochs
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        logits = model(b_input_ids, attention_mask=b_input_mask)
        
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, b_labels)
        
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f" Average training loss: {avg_train_loss:.2f}")

# After fine-tuning, evaluate the model again to get the new accuracy
final_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Final Validation Accuracy: {final_accuracy:.4f}')

model_save_path = "/kaggle/working/BERT_BASE_ft_model.pth"
torch.save(model.state_dict(), model_save_path)

print("Training complete")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initial Validation Accuracy: 0.0078




 Average training loss: 2.49
 Average training loss: 2.01
 Average training loss: 1.70
 Average training loss: 1.45
 Average training loss: 1.22
 Average training loss: 1.00
 Average training loss: 0.81
 Average training loss: 0.65
 Average training loss: 0.55
 Average training loss: 0.46
 Average training loss: 0.39
 Average training loss: 0.35
 Average training loss: 0.30
 Average training loss: 0.28
 Average training loss: 0.26
 Average training loss: 0.22
 Average training loss: 0.22
 Average training loss: 0.21
 Average training loss: 0.20
 Average training loss: 0.19
 Average training loss: 0.18
 Average training loss: 0.18
 Average training loss: 0.17
 Average training loss: 0.16
 Average training loss: 0.16
Final Validation Accuracy: 0.8471
Training complete


## BERT LARGE

In [3]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn

# Load the dataset
df = pd.read_csv('/kaggle/input/no-negative-transactions/amex_all_categorized_raw.csv')
df = df[df['Description'] != 'Description']

# Encode categories
label_encoder = LabelEncoder()
df['Category_encoded'] = label_encoder.fit_transform(df['Category'])

# Initialize the tokenizer with bert-large-uncased
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Tokenize the dataset
input_ids = []
attention_masks = []

for desc in df['Description']:
    encoded_dict = tokenizer.encode_plus(
        desc,                              # Sentence to encode
        add_special_tokens = True,         # Add '[CLS]' and '[SEP]'
        max_length = 64,                   # Pad & truncate all sentences
        pad_to_max_length = True,
        return_attention_mask = True,      # Construct attention masks
        return_tensors = 'pt',             # Return pytorch tensors
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(df['Category_encoded'].values)

# Splitting the dataset
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.1)

# Convert to DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=32)

# Define the custom model with BertModel using bert-large-uncased
class BertForSequenceClassificationCustom(nn.Module):
    def __init__(self, num_labels):
        super(BertForSequenceClassificationCustom, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-large-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(1024, num_labels) # Update the size for bert-large-uncased

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Initialize the model
num_labels = len(df['Category'].unique())
model = BertForSequenceClassificationCustom(num_labels)

# (The rest of the code remains unchanged)

# Define the evaluation function
def evaluate_model(model, dataloader, device):
    model.eval()
    total_eval_accuracy = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()

        total_eval_accuracy += np.sum(predictions == labels_flat) / len(labels_flat)
    
    return total_eval_accuracy / len(dataloader)

# Before fine-tuning, evaluate the model to get the initial accuracy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

initial_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Initial Validation Accuracy: {initial_accuracy:.4f}')

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 25  # Adjust the number of epochs here
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch_i in range(0, epochs):  # Loop through 15 epochs
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        logits = model(b_input_ids, attention_mask=b_input_mask)
        
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, b_labels)
        
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f" Average training loss: {avg_train_loss:.2f}")

# After fine-tuning, evaluate the model again to get the new accuracy
final_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Final Validation Accuracy: {final_accuracy:.4f}')

model_save_path = "/kaggle/working/BERT_LARGE_ft_model.pth"
torch.save(model.state_dict(), model_save_path)

print("Training complete")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Initial Validation Accuracy: 0.0625




 Average training loss: 2.42
 Average training loss: 2.05
 Average training loss: 1.63
 Average training loss: 1.15
 Average training loss: 0.86
 Average training loss: 0.66
 Average training loss: 0.49
 Average training loss: 0.41
 Average training loss: 0.34
 Average training loss: 0.27
 Average training loss: 0.25
 Average training loss: 0.21
 Average training loss: 0.19
 Average training loss: 0.19
 Average training loss: 0.18
 Average training loss: 0.16
 Average training loss: 0.16
 Average training loss: 0.14
 Average training loss: 0.13
 Average training loss: 0.14
 Average training loss: 0.13
 Average training loss: 0.12
 Average training loss: 0.11
 Average training loss: 0.11
 Average training loss: 0.11
Final Validation Accuracy: 0.8471
Training complete


## Roberta

In [4]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn

# Load the dataset
df = pd.read_csv('/kaggle/input/no-negative-transactions/amex_all_categorized_raw.csv')
df = df[df['Description'] != 'Description']  # Clean any repeated headers

# Encode categories
label_encoder = LabelEncoder()
df['Category_encoded'] = label_encoder.fit_transform(df['Category'])

# Initialize the tokenizer with roberta-large
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Tokenize the dataset
input_ids = []
attention_masks = []

for desc in df['Description']:
    encoded_dict = tokenizer.encode_plus(
        desc,                              # Sentence to encode
        add_special_tokens = True,         # Add '[CLS]' and '[SEP]'
        max_length = 64,                   # Pad & truncate all sentences
        pad_to_max_length = True,
        return_attention_mask = True,      # Construct attention masks
        return_tensors = 'pt',             # Return pytorch tensors
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(df['Category_encoded'].values)

# Splitting the dataset
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.1)

# Convert to DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=32)

# Define the custom model with RobertaModel
class RobertaForSequenceClassificationCustom(nn.Module):
    def __init__(self, num_labels):
        super(RobertaForSequenceClassificationCustom, self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained('roberta-large')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(1024, num_labels) # Ensure the size matches roberta-large's output features

    def forward(self, input_ids, attention_mask=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Initialize the model
num_labels = len(df['Category'].unique())
model = RobertaForSequenceClassificationCustom(num_labels)

# Define the evaluation function
def evaluate_model(model, dataloader, device):
    model.eval()
    total_eval_accuracy = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()

        total_eval_accuracy += np.sum(predictions == labels_flat) / len(labels_flat)
    
    return total_eval_accuracy / len(dataloader)

# Before fine-tuning, evaluate the model to get the initial accuracy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

initial_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Initial Validation Accuracy: {initial_accuracy:.4f}')

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 25  # Adjust the number of epochs here
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch_i in range(0, epochs):  # Loop through 15 epochs
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        logits = model(b_input_ids, attention_mask=b_input_mask)
        
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, b_labels)
        
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f" Average training loss: {avg_train_loss:.2f}")

# After fine-tuning, evaluate the model again to get the new accuracy
final_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Final Validation Accuracy: {final_accuracy:.4f}')

model_save_path = "/kaggle/working/RoBERTa_LARGE_ft_model.pth"
torch.save(model.state_dict(), model_save_path)

print("Training complete")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initial Validation Accuracy: 0.0703




 Average training loss: 2.28
 Average training loss: 1.57
 Average training loss: 0.96
 Average training loss: 0.67
 Average training loss: 0.51
 Average training loss: 0.38
 Average training loss: 0.32
 Average training loss: 0.28
 Average training loss: 0.24
 Average training loss: 0.22
 Average training loss: 0.18
 Average training loss: 0.18
 Average training loss: 0.17
 Average training loss: 0.17
 Average training loss: 0.16
 Average training loss: 0.14
 Average training loss: 0.13
 Average training loss: 0.13
 Average training loss: 0.13
 Average training loss: 0.12
 Average training loss: 0.11
 Average training loss: 0.10
 Average training loss: 0.10
 Average training loss: 0.10
 Average training loss: 0.10
Final Validation Accuracy: 0.8549
Training complete
