In [None]:
!pip install -U datasets transformers

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

This code verifies if a GPU is available and prints its name if found. This is important for accelerating the training process.

In [None]:
# Shell out to nvidia-smi to show the GPU and driver status of the host.
!nvidia-smi

In [None]:
import torch

# Ask PyTorch once whether CUDA can be used, then report the result.
gpu_available = torch.cuda.is_available()
print("GPU Available:", gpu_available)

# Name of device 0, shown only when a GPU was detected.
if gpu_available:
    print("GPU Name:", torch.cuda.get_device_name(0))

# **Data Loading and Preprocessing**
This section loads the consumer complaints data from a CSV file,
preprocesses it by renaming columns, and creates a label mapping for the target variable.

In [None]:
# 1. Load the Data
# ----------------
# Read the raw consumer-complaints CSV into a DataFrame. Column selection,
# renaming, and label encoding are done in later cells.

# Location of the CSV; change this one constant when the file lives elsewhere.
COMPLAINTS_CSV_PATH = "/content/complaints-2025.csv"

complaints_df = pd.read_csv(COMPLAINTS_CSV_PATH)

In [None]:
# (rows, columns) of the frame as loaded.
complaints_df.shape

In [None]:
# Column names, dtypes, and non-null counts of the frame as loaded.
complaints_df.info()

In [None]:
# Preview the first ten rows.
complaints_df.head(10)

In [None]:
# Count the complaints per issue and turn the result into a regular DataFrame.
issues_df = complaints_df["Issue"].value_counts().reset_index()

In [None]:
# Preview the first ten rows of the per-issue counts.
issues_df.head(10)

In [None]:
# Number of distinct issues among the rows that are usable for training, i.e.
# rows where both the narrative and the issue are present. Counting on the
# unfiltered frame can include issues that vanish once rows without a narrative
# are dropped, and reading the count from `issues_df` depends on how the
# installed pandas version names the columns of `value_counts().reset_index()`.
num_labels = (
    complaints_df
    .dropna(subset=["Consumer complaint narrative", "Issue"])["Issue"]
    .nunique()
)

In [None]:
# Show the number of target classes.
num_labels


Here, we select the relevant columns for text classification (narrative and issue) and rename them for clarity.


In [None]:
# Keep only the text and target columns, discard rows missing either value, and
# renumber the rows. drop=True prevents the old index from being inserted as an
# extra "index" column.
complaints_df = (
    complaints_df[["Consumer complaint narrative", "Issue"]]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Re-inspect columns and non-null counts after the selection and dropna step.
complaints_df.info()

In [None]:
# Two-column working frame using the short names the rest of the notebook expects.
df = complaints_df[["Consumer complaint narrative", "Issue"]].rename(
    columns={"Consumer complaint narrative": "Narrative", "Issue": "labels"}
)

This section converts the pandas DataFrame to a Hugging Face Dataset and encodes the labels for classification.

In [None]:
# Build a Hugging Face Dataset from the frame, then encode the "labels" column
# as a class-label feature.
dataset = Dataset.from_pandas(df).class_encode_column("labels")

In [None]:
# Map every integer class id to its label name, following the order of the
# names stored on the encoded "labels" feature.
label_mapping = dict(enumerate(dataset.features["labels"].names))

# Display the mapping
label_mapping


In [None]:
# Split into train and test sets (80-20 split).
# The fixed seed makes the split repeatable, so the evaluation rows (and the
# metrics computed on them) stay the same across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

This section defines a function to tokenize the text data using the chosen model's tokenizer. It then applies the tokenization to both training and evaluation datasets.

In [None]:
# 2. Tokenize the Data
# ---------------------
# Identifier of the pre-trained checkpoint; the same name is reused later when the model is loaded.
model_name = "ProsusAI/finbert"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize the complaint narrative text of one example or one batch.

    Args:
        example: Mapping with a "Narrative" entry holding the text to encode
            (a list of texts when called through ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the narratives, truncated to at most 128 tokens.
    """
    # No padding here: padding inside a batched map pads every map batch to its
    # own longest text and stores those pad tokens in the dataset. The Trainer
    # is handed the tokenizer, so its data collator pads each training batch
    # on the fly instead.
    return tokenizer(
        example["Narrative"],
        truncation=True,
        max_length=128  # adjust max_length based on your data and available compute
    )

In [None]:
# Tokenize the train split first, then the evaluation split; batched=True hands
# the function many rows per call.
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Sanity check: list the columns present after tokenization.
print(train_dataset.column_names)  # Should include 'input_ids', 'attention_mask', and 'labels'

In [None]:
# Work out which columns are not model inputs or targets (decided from the
# train split) and drop them from both splits.
columns_to_remove = [
    name
    for name in train_dataset.column_names
    if name not in ("input_ids", "attention_mask", "labels")
]
train_dataset, eval_dataset = (
    split.remove_columns(columns_to_remove)
    for split in (train_dataset, eval_dataset)
)

# Have both splits return torch tensors, which is what the Trainer consumes.
eval_dataset.set_format("torch")
train_dataset.set_format("torch")

This section defines the `compute_metrics` function that the Trainer calls after each evaluation pass.

It currently reports accuracy only; to track more metrics (for example precision, recall, or F1), add them to the dictionary the function returns.


In [None]:
# 3. Define the Metrics Function
# -------------------------------
# The Trainer calls this function with the model outputs and the reference
# labels of an evaluation pass; whatever dictionary it returns is reported.
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``; the class with the
            largest logit along the last axis is taken as the prediction.

    Returns:
        dict with a single key, "accuracy".
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(references, predicted_ids)}

This section loads the pre-trained FinBERT model, specifying the number of labels for our classification task.


In [None]:
# 4. Load and Configure the Model
# --------------------------------
# Load the pre-trained checkpoint with a classification head sized for this task.
# The head size and the id<->label tables are taken from the encoded dataset
# labels, so the head has exactly one output per encoded class and the saved
# config (and any pipeline built from it) reports issue names rather than
# generic "LABEL_n" ids.
id2label = dict(label_mapping)
label2id = {label: idx for idx, label in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # allow a head whose size differs from the checkpoint's
)

This code defines the hyperparameters and settings for the training process, such as learning rate, batch size, and number of epochs.


In [None]:
# 5. Set Up Training Arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./results",             # output directory
    eval_strategy="epoch",              # evaluate at the end of every epoch
    save_strategy="epoch",              # checkpoint on the same schedule as evaluation
    report_to="none",                   # no external experiment tracker
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,                 # adjust epochs based on your task
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,        # restore the best checkpoint when training ends
    # Mixed precision needs a CUDA GPU; fall back to full precision on CPU-only
    # machines instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
)

This section initializes the Trainer with the model, training arguments, and datasets, and then starts the fine-tuning process.

In [None]:
# 6. Initialize the Trainer and Fine-Tune the Model
# --------------------------------------------------
# Bundle the model, the training configuration, both dataset splits, the
# metrics function, and the tokenizer into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,  # called on every evaluation pass
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop
trainer.train()

In [None]:
# Run one more evaluation pass with the trainer and print the metrics it returns.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

This code saves the trained model and tokenizer for later use.


In [None]:
# 7. Save the Fine-Tuned Model and Tokenizer
# -------------------------------------------
# After training, save the model and tokenizer so that you can load them later for inference.
# Both are written to the same directory, so that single path is enough to reload them.
model_save_path = "./modelnew"
trainer.save_model(model_save_path)  # Saves the model
tokenizer.save_pretrained(model_save_path)  # Saves the tokenizer

# Confirm where the artifacts were written.
print(f"Model and tokenizer saved to {model_save_path}")


This section reloads the saved model and tokenizer from disk and wraps them in a text-classification pipeline for inference.


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# 8. Load the Saved Model and Tokenizer
# -------------------------------------
# Reload both artifacts from the directory they were saved to, rather than
# reusing the in-memory objects from training.
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# Create a text classification pipeline using the loaded model and tokenizer
classifier = pipeline("text-classification", model=loaded_model, tokenizer=loaded_tokenizer)

This section demonstrates how to use the trained model to classify new consumer complaints and print the predictions.


In [None]:
# 9. Classify New Consumer Complaints
# ------------------------------------
# List of new complaint texts to classify.
new_complaints = [
    "I am extremely disappointed with the bank's customer service and hidden fees.",
    "My credit card company continues to charge me for services I never signed up for.",
    "The mortgage process was unclear and misleading. I'm not sure I got what I was promised."
]

# Get predictions for the new complaints. Truncate to the same 128-token limit
# used when tokenizing the training data, so inference sees inputs shaped like
# the training inputs and long narratives cannot exceed the model's maximum
# input length.
predictions = classifier(new_complaints, truncation=True, max_length=128)

# Print each complaint next to the prediction the pipeline returned for it.
for complaint, pred in zip(new_complaints, predictions):
    print(f"Complaint: {complaint}")
    print(f"Prediction: {pred}\n")

In [None]:
# Show the class-id -> label-name mapping again for reading the predictions above.
label_mapping


In [None]:
# Run the trainer's model over the held-out split.
predictions_output = trainer.predict(eval_dataset)

# Pull out the raw scores and the reference class ids in one step.
logits, true_labels = predictions_output.predictions, predictions_output.label_ids

# The predicted class is the index of the largest score along the last axis.
predicted_labels = np.argmax(logits, axis=-1)


This section calculates and displays the confusion matrix to evaluate the performance of the trained model.


In [None]:
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix over the full set of class ids. Passing `labels`
# keeps one row and one column per class even when a class is absent from both
# the evaluation labels and the predictions, so the matrix always lines up with
# the names in `label_mapping`.
conf_matrix = confusion_matrix(
    true_labels,
    predicted_labels,
    labels=list(label_mapping.keys()),
)

# Print the raw confusion matrix
print("Confusion Matrix:\n", conf_matrix)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tick labels: the class names in class-id order.
label_names = list(label_mapping.values())

# Draw the heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_names,
    yticklabels=label_names,
    ax=ax,
)

# Axis labels and title
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()