In [5]:
import numpy as np
import pandas as pd

In [6]:
df = pd.read_csv('Data_Datasets_Multi-label_dataset_gpt_multi_label_32000.csv')

df

Unnamed: 0,review,feature request,bug report,rating,user experience,ID
0,This is a great source of information. I would...,1,0,0,1,108375
1,This is probably the most enjoyable game I've ...,0,0,1,1,3500
2,==MOST WELCOME IN CHAMPCASH== Champcash is an ...,0,0,1,0,77617
3,This app is so bright it hurts. You're trying ...,1,0,1,1,85647
4,Devs this is an awesome app. I've always been...,1,0,0,0,66698
...,...,...,...,...,...,...
31995,Takes up too much memory... Anything that forc...,0,0,0,1,203042
31996,Do not install this! It made my phone go nuts....,0,0,1,0,37583
31997,Lags on lockscreen I almost absolutely love Mu...,0,0,0,1,29910
31998,How to set featured image? Fix this problem Ne...,1,1,0,0,85066


In [15]:
from datasets import Dataset, DatasetDict
dataset = Dataset.from_pandas(df)

In [16]:
dataset

Dataset({
    features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
    num_rows: 32000
})

In [17]:
ds1 = dataset.train_test_split(test_size=0.2)
ds2 = ds1['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    'train': ds1['train'],
    'test': ds2['train'],
    'valid': ds2['test']})

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
        num_rows: 25600
    })
    test: Dataset({
        features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
        num_rows: 3200
    })
    valid: Dataset({
        features: ['review', 'feature request', 'bug report', 'rating', 'user experience', 'ID'],
        num_rows: 3200
    })
})

In [22]:
dataset['train'][0]

{'review': 'Would rearrange swathes of whitespace again. Being able to rearrange entries is a welcome addition  but the updated design is the new poster child for complaints about gratuitous whitespace/padding in Material Design. Maybe tone it down a notch and increase information density slightly.',
 'feature request': 0,
 'bug report': 0,
 'rating': 0,
 'user experience': 0,
 'ID': 280033}

In [25]:
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'review']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [26]:
labels

['feature request', 'bug report', 'rating', 'user experience']

In [27]:
id2label

{0: 'feature request', 1: 'bug report', 2: 'rating', 3: 'user experience'}

In [28]:
label2id

{'feature request': 0, 'bug report': 1, 'rating': 2, 'user experience': 3}

In [31]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["review"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [32]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/25600 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

In [45]:
encoded_dataset['train'][8].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [46]:
tokenizer.decode(encoded_dataset['train'][8]['input_ids'])

'[CLS] how do you do it??? wtf 5. 1. 1 samsung rom cant execute any init. d anymore after using various method looks like it was enabled but the script wont applied in boot.. even i have modified init. rc too it seems not working as the run parts command error.. bt u r the one that solving the mess by the point " " " " " " " " " " " " " " " " emulate init. d " " " " " " " " " " " " " " " " in ur app.. anything in ur advise? [SEP]'

In [47]:
encoded_dataset['train'][8]['labels']

[0.0, 1.0, 0.0, 1.0]

In [48]:
[id2label[idx] for idx, label in enumerate(encoded_dataset['train'][8]['labels']) if label == 1.0]

['bug report', 'user experience']

In [49]:
encoded_dataset.set_format("torch")

In [51]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
batch_size = 8
metric_name = "accuracy"

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)



In [55]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [56]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.093,0.079811,0.972912,0.973981,0.935937
2,0.0553,0.049516,0.983692,0.984694,0.961562
3,0.0409,0.035529,0.987106,0.988207,0.970313
4,0.031,0.033627,0.987624,0.989027,0.970938
5,0.0256,0.03331,0.98811,0.98919,0.972187




TrainOutput(global_step=8000, training_loss=0.07419220840930939, metrics={'train_runtime': 722.1153, 'train_samples_per_second': 177.257, 'train_steps_per_second': 11.079, 'total_flos': 4239107948544000.0, 'train_loss': 0.07419220840930939, 'epoch': 5.0})

In [57]:
trainer.evaluate()



{'eval_loss': 0.03331023454666138,
 'eval_f1': 0.9881099826603914,
 'eval_roc_auc': 0.9891900335199304,
 'eval_accuracy': 0.9721875,
 'eval_runtime': 3.9785,
 'eval_samples_per_second': 804.324,
 'eval_steps_per_second': 50.27,
 'epoch': 5.0}