#### Loading the dataset and necessary libraries

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np

# Loading the dataset
data_files = {"train": "../../data_splits/train-data-split.csv", "test": "../../data_splits/test-data-split.csv"}
dataset = load_dataset("csv", data_files=data_files)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Consumer complaint narrative', 'Product', 'Sub-product', 'Issue', 'Sub-issue'],
        num_rows: 185637
    })
    test: Dataset({
        features: ['Consumer complaint narrative', 'Product', 'Sub-product', 'Issue', 'Sub-issue'],
        num_rows: 61880
    })
})

#### Preparing the train and test set

In [4]:
issue_categories = np.unique(dataset['train']['Issue'])
issue_mapping = {k : i for i,k in enumerate(issue_categories)}

# Encoding the labels
def encode_categories(batch):
    return {"labels": [issue_mapping[category] for category in batch['Issue']]}
    
dataset['train'] = dataset['train'].map(encode_categories , batched=True)
dataset['test'] = dataset['test'].map(encode_categories, batched=True)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Consumer complaint narrative', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'labels'],
        num_rows: 185637
    })
    test: Dataset({
        features: ['Consumer complaint narrative', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'labels'],
        num_rows: 61880
    })
})

#### Loading the tokenizer

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["Consumer complaint narrative"], truncation=True, padding="max_length", max_length=512)

#### Mapping into batches

In [8]:
tokenized_data = dataset.map(preprocess_function, batched=True, remove_columns=['Product', 'Sub-product','Issue', 'Sub-issue'])

Map:   0%|          | 0/61880 [00:00<?, ? examples/s]

In [9]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['Consumer complaint narrative', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 185637
    })
    test: Dataset({
        features: ['Consumer complaint narrative', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 61880
    })
})

In [10]:
tokenized_data = tokenized_data.remove_columns('Consumer complaint narrative')

In [11]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 185637
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 61880
    })
})

In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Defining the metrics

In [13]:
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    result = {}

    metric_f1 = load_metric("f1")
    metric_precision = load_metric("precision")
    metric_recall = load_metric("recall")
    metric_acc = load_metric("accuracy")
    
    result["accuracy"] = metric_acc.compute(predictions = predictions, references = labels)["accuracy"]
    result["precision"] = metric_precision.compute(predictions = predictions, references = labels,average = 'macro')['precision']
    result["recall"] = metric_recall.compute(predictions = predictions, references = labels,average = 'macro')["recall"]
    result["f1"] = metric_f1.compute(predictions = predictions, references = labels, average = 'macro')["f1"]
    
    return result

In [14]:
id2label = {v : k for k,v in issue_mapping.items()}
label2id = issue_mapping

#### Loading the pre-trained model

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Defining training arguments
training_args = TrainingArguments(
    output_dir="distil-bert-fintuned-issue-cfpb-complaints",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


#### Fine-tuning the model

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7418,0.732448,0.74213,0.53187,0.4427,0.453331


  metric_f1 = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=11603, training_loss=0.8782807422241117, metrics={'train_runtime': 2334.134, 'train_samples_per_second': 79.531, 'train_steps_per_second': 4.971, 'total_flos': 2.460093697119744e+16, 'train_loss': 0.8782807422241117, 'epoch': 1.0})

#### Getting the predictions

In [18]:
predictions = trainer.predict(tokenized_data['test'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#Pushing the model to hub
trainer.push_to_hub("Mahesh9/distil-bert-fintuned-issue-cfpb-complaints")

CommitInfo(commit_url='https://huggingface.co/Mahesh9/distil-bert-fintuned-issue-cfpb-complaints/commit/c2d64d57bb7eaae79de58bbbfc5ef393cd091c2e', commit_message='Mahesh9/distil-bert-fintuned-issue-cfpb-complaints', commit_description='', oid='c2d64d57bb7eaae79de58bbbfc5ef393cd091c2e', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
predicted_categories = np.argmax(predictions.predictions, axis=-1)
true_categories = [label2id[label] for label in dataset["test"]["Issue"]]

#### Classification report

In [23]:
report = classification_report(true_categories, predicted_categories, target_names=product_categories)
print(report)

                                                                                  precision    recall  f1-score   support

                                               Attempts to collect debt not owed       0.50      0.60      0.54      1381
                                                              Closing an account       0.66      0.58      0.62       521
                                                            Closing your account       0.49      0.67      0.56       191
                         Credit monitoring or identity theft protection services       1.00      0.28      0.44       115
                                            Dealing with your lender or servicer       0.96      0.95      0.96       314
                                              False statements or representation       0.00      0.00      0.00       202
                                                                Fees or interest       0.56      0.55      0.56       186
                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Saving the results
import pandas as pd

true_labels_text = [id2label[label] for label in true_categories]
predicted_labels_text = [id2label[label] for label in predicted_categories]

results_df = pd.DataFrame({
    'True Labels': true_labels_text,
    'Predicted Labels': predicted_labels_text
})

original_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in tokenized_data["test"]["input_ids"]]
results_df['Complaint Narrative'] = original_texts

In [25]:
# Dumping the saved results into a csv
results_df.to_csv('distil-bert-fintuned-issue-cfpb-complaints/saved_results/model_predictions_and_labels.csv', index=False)