In [2]:
import numpy as np 
import torch 
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from datasets import load_from_disk, concatenate_datasets, DatasetDict
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score


In [2]:
torch.cuda.set_device(0)

In [3]:
dataset = load_from_disk('data/neural_data_post_process/')

train_data = concatenate_datasets([dataset['train'],dataset['valid'].select(range(842)), dataset['test'].select(range(2000))])
valid_data = dataset['valid'].select(range(842,2842,1))
test_data = dataset['test'].select(range(2000,2842,1))

dataset = DatasetDict({
    'train': train_data,
    'test': test_data,
    'valid': valid_data
    }
)

In [4]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
labels = [label for label in dataset['train'].features.keys() if label not in ['index', 'name','author','text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

assert len(labels) == 20

In [6]:
def prepare_data(batch):
    text = batch['text']
    encoding = tokenizer(text,padding='max_length', truncation=True, max_length=512)
    labels_batch = {key: batch[key] for key in batch.keys() if key in labels}
    labels_matrix = np.zeros((len(text),len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:,idx] = labels_batch[label]

    encoding['labels'] = labels_matrix.tolist()
    
    return encoding
encoded_dataset = dataset.map(prepare_data,batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/11368 [00:00<?, ? examples/s]

Map:   0%|          | 0/842 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
encoded_dataset.set_format("torch")

# MODEL BUILD

In [17]:
device='cuda'
model = DistilBertForSequenceClassification.from_pretrained(
    "content/models/distilbert-book-classifier/checkpoint-4977"
)
model = model.to(device)

In [9]:
args = TrainingArguments(
    f"TxMM/models/distilbert-book-classifier",
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

In [40]:
def multi_label_metrics(predictions, labels):
    sigmoid = torch.nn.Sigmoid()
    probabilities = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probabilities.shape)
    y_true = labels
    y_pred[np.where(probabilities>=0.5)] = 1
    f1_micro = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc_micro = roc_auc_score(y_true=y_true, y_score=y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    f1_weight = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    roc_auc_weight = roc_auc_score(y_true=y_true, y_score=y_pred, average='weighted')

    metrics = {
        "f1":f1_micro,
        "roc_auc":roc_auc_micro,
        "accuracy":accuracy,
        "f1 weights":f1_weight,
        "roc weights":roc_auc_weight,
              }
    return metrics


def compute_metrics(p:EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,tuple) else p.predictions
    result= multi_label_metrics(
        predictions=preds,
        labels=p.label_ids
    )
    return result

In [18]:
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0).to(device), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0).to(device))
outputs

SequenceClassifierOutput(loss=tensor(0.1111, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-3.1344, -4.3958, -6.3208, -3.7092, -4.2933, -4.0962, -4.0419, -3.0307,
         -5.1862, -0.0478, -5.1422, -3.2470, -5.9466, -2.2737, -1.2610, -4.6415,
         -4.1033, -1.2608, -3.3578, -0.1504]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [41]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [42]:
results = trainer.predict(encoded_dataset['test'])

In [49]:
results

PredictionOutput(predictions=array([[-4.601303  , -5.6358523 , -5.12385   , ..., -3.0735242 ,
        -5.060347  , -5.9021487 ],
       [-5.126658  , -2.610917  , -2.8949027 , ..., -0.11496418,
        -4.8210545 , -3.7965467 ],
       [-4.3129416 , -4.693411  , -6.271017  , ..., -1.3670057 ,
        -4.4582853 , -4.690391  ],
       ...,
       [-4.116854  , -1.6124333 , -5.3117094 , ..., -2.2691333 ,
        -3.4601767 , -3.9122982 ],
       [-4.7879944 , -1.201875  , -2.8339856 , ..., -3.3609045 ,
        -3.9086614 , -3.8135092 ],
       [-4.4846244 ,  2.0346642 , -5.1685324 , ..., -4.1326036 ,
        -0.5097189 , -3.387729  ]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.16784217953681946, 'test_f1': 0.4918032786885246, '

In [None]:
InterAceAge = "".join([
    "The future is forever a projection of the present. ",
    "Something whose connection with human experience we cannot grasp is bound to be frightening. ",
    "Perhaps there is no such thing as a cruel future. The future, properly speaking, is already cruel by virtue of being the future. The responsibility for this cruelty lies not on the side of the future, but on that of a present unable to accept the abyss that separates the two."
])

In [33]:
dataset['test'][0]

{'index': 14179,
 'name': 'Tales of a Fourth Grade Nothing',
 'author': 'Judy Blume',
 'text': 'on the rug. What’s next on your reading list? Discove. I thought how great it would be if we could trade in Fudge for a nice cocker spaniel. birthday party. And even more, I’m going to see to it that he’s happy. One night my father came home from the office all excited. He told us Mr. and Mrs. Yarby were coming to New York. He’s the president of the Juicy-O company. H. Hallowe’en. Fudge. If I decided not to eat they’d probably never even notice. dope-pushers hang around there. But taking dope is even dumber than smoking, so nobody’s going to hook me! We live o. He’s really fat. Wear or wrap. May tenth. How do, Peter, ” Mr. Yarby said. Mrs. Yarby just gave me a nod. windup train that made a lot of noise. Every time it bumped into something it turned around and went the other way. Fudge liked it a lot. He likes anything that’s noisy. Ralph arrived first. He’s really fat. And he isn’t even four