In [31]:
import numpy as np
import pandas as pd

In [32]:
df = pd.read_csv('Data_Datasets_Multi-label_dataset_gpt_multi_label_32000.csv')

df

Unnamed: 0,review,feature request,bug report,rating,user experience,ID
0,This is a great source of information. I would...,1,0,0,1,108375
1,This is probably the most enjoyable game I've ...,0,0,1,1,3500
2,==MOST WELCOME IN CHAMPCASH== Champcash is an ...,0,0,1,0,77617
3,This app is so bright it hurts. You're trying ...,1,0,1,1,85647
4,Devs this is an awesome app. I've always been...,1,0,0,0,66698
...,...,...,...,...,...,...
31995,Takes up too much memory... Anything that forc...,0,0,0,1,203042
31996,Do not install this! It made my phone go nuts....,0,0,1,0,37583
31997,Lags on lockscreen I almost absolutely love Mu...,0,0,0,1,29910
31998,How to set featured image? Fix this problem Ne...,1,1,0,0,85066


In [33]:
from datasets import Dataset, DatasetDict
dataset = Dataset.from_pandas(df)

In [34]:
dataset

Dataset({
    features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
    num_rows: 32000
})

In [35]:
ds1 = dataset.train_test_split(test_size=0.2)
ds2 = ds1['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    'train': ds1['train'],
    'test': ds2['train'],
    'valid': ds2['test']})

In [36]:
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
        num_rows: 25600
    })
    test: Dataset({
        features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
        num_rows: 3200
    })
    valid: Dataset({
        features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
        num_rows: 3200
    })
})

In [37]:
dataset['train'][0]

{'review': 'Screen flips then back to home Did everything right. Even took advice putting bios into android/data/com.reicast ect. directory from site. Please update a fix. Thanks s5 active version 5.0',
 'feature request': 0,
 'bug report': 0,
 'rating': 0,
 'user experience': 0,
 'ID': 4839}

In [38]:
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'review']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [39]:
labels

['feature request', 'bug report', 'rating', 'user experience']

In [40]:
id2label

{0: 'feature request', 1: 'bug report', 2: 'rating', 3: 'user experience'}

In [41]:
label2id

{'feature request': 0, 'bug report': 1, 'rating': 2, 'user experience': 3}

In [42]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["review"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [43]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/25600 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

In [44]:
encoded_dataset['train'][8].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [45]:
tokenizer.decode(encoded_dataset['train'][8]['input_ids'])

"[CLS] lacks simplicity i cannot add a whole folder at one and still i selected 500 wallpapers to add and when i tapped add it didn't add anything. uninstalling this right moment.[SEP]<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"

In [46]:
encoded_dataset['train'][8]['labels']

[0.0, 0.0, 0.0, 0.0]

In [47]:
[id2label[idx] for idx, label in enumerate(encoded_dataset['train'][8]['labels']) if label == 1.0]

[]

In [48]:
encoded_dataset.set_format("torch")

In [49]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=64,
                        lora_dropout=0.1,
                        target_modules = ["attention.query", "attention.key", "attention.value", "ffn", "attention.dense"])
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 43,012 || all params: 11,729,672 || trainable%: 0.3667


In [51]:
batch_size = 16
metric_name = "accuracy"

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    lr_scheduler_type="cosine",  # learning rate scheduler type
    warmup_ratio=0.1  # warmup ratio for lr scheduler
)



In [52]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
from transformers import EarlyStoppingCallback
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

# Add EarlyStoppingCallback to the trainer
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

In [53]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.5895,0.391255,0.689531,0.764148,0.49125
2,0.2003,0.135896,0.940938,0.945162,0.8625
3,0.1134,0.078013,0.971598,0.972661,0.93125
4,0.0815,0.078042,0.973226,0.974037,0.935312
5,0.0766,0.075043,0.973432,0.974905,0.93625
6,0.0688,0.054826,0.980478,0.981949,0.9525
7,0.0507,0.049015,0.982836,0.984234,0.96
8,0.0962,0.048201,0.983767,0.98426,0.960625
9,0.04,0.030647,0.988712,0.989256,0.973125
10,0.0306,0.032854,0.989566,0.989973,0.975313




TrainOutput(global_step=16000, training_loss=0.073971287637949, metrics={'train_runtime': 2734.0549, 'train_samples_per_second': 187.268, 'train_steps_per_second': 5.852, 'total_flos': 3076474798080000.0, 'train_loss': 0.073971287637949, 'epoch': 20.0})

In [54]:
trainer.evaluate()



{'eval_loss': 0.02708737552165985,
 'eval_f1': 0.9907923431063727,
 'eval_roc_auc': 0.9912373334694198,
 'eval_accuracy': 0.978125,
 'eval_runtime': 8.2853,
 'eval_samples_per_second': 386.224,
 'eval_steps_per_second': 12.07,
 'epoch': 20.0}