In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


DATA_PATH = "../data/final_tweets/"

train_df = pd.read_csv(DATA_PATH +'train_df.csv')
valid_df = pd.read_csv(DATA_PATH +'validate_df.csv')
test_df  = pd.read_csv(DATA_PATH + 'test_df.csv')


In [3]:
X_train = train_df['tweet_text']
Y_train = train_df['text_info']

X_valid = valid_df['tweet_text']
Y_valid = valid_df['text_info']

X_test = test_df['tweet_text']
Y_test = test_df['text_info']

train_texts = list(X_train.values)
val_texts = list(X_valid.values)
test_texts = list(X_test.values)

In [4]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_labels = Y_train
val_labels   = Y_valid
test_labels  = Y_test

In [5]:
import torch

class TweetsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetsDataset(train_encodings, train_labels)
val_dataset = TweetsDataset(val_encodings, val_labels)
test_dataset = TweetsDataset(test_encodings, test_labels)

In [6]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

pos_weight = torch.tensor([1.48, 0.75])

criterion  = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)


def my_custom_loss(logits,labels):
    return criterion(logits,labels)
    
from transformers import Trainer
class MyTrainer(Trainer):
    def compute_loss(self, model, inputs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        return my_custom_loss(logits, labels)

    
trainer = MyTrainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=559.0), HTML(value='')))






HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=559.0), HTML(value='')))




HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=559.0), HTML(value='')))





TrainOutput(global_step=1677, training_loss=0.2901203105591024)

In [7]:
trainer.evaluate()

HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=30.0), HTML(value='')))




{'eval_loss': 0.5215925373757879, 'epoch': 3.0}

In [8]:
predictions = trainer.predict(test_dataset)

HBox(children=(HTML(value='Prediction'), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [15]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [20]:
p = torch.tensor(predictions[0])

In [26]:
p

tensor([[-2.6971,  2.2211],
        [-2.8412,  2.3875],
        [-2.4308,  2.0391],
        ...,
        [-2.5984,  2.1481],
        [ 3.1475, -3.2848],
        [ 2.3923, -2.5704]])

In [21]:
m = nn.Softmax(dim=1)
output = m(p)

In [27]:
output

tensor([[0.0073, 0.9927],
        [0.0053, 0.9947],
        [0.0113, 0.9887],
        ...,
        [0.0086, 0.9914],
        [0.9984, 0.0016],
        [0.9931, 0.0069]])

In [29]:
pred = torch.argmax(output,dim=1)

In [31]:
pred[0:10]

tensor([1, 1, 1, 1, 0, 1, 0, 0, 1, 0])

In [33]:
test_labels[0:10]

0    1
1    1
2    1
3    1
4    0
5    1
6    0
7    0
8    1
9    0
Name: text_info, dtype: int64

In [36]:
from sklearn.metrics import classification_report

print(classification_report(test_labels, pred, target_names=["INF", "NON-INF"]))

              precision    recall  f1-score   support

         INF       0.86      0.73      0.79       646
     NON-INF       0.87      0.94      0.90      1269

    accuracy                           0.87      1915
   macro avg       0.86      0.83      0.85      1915
weighted avg       0.87      0.87      0.86      1915



In [14]:
output

tensor([[0.2778, 0.2556, 0.4666],
        [0.1597, 0.4019, 0.4384]])

In [25]:
criterion(p,output)

tensor(0.1550)

In [25]:
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [26]:

for batch in test_loader:

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    break


In [27]:
labels

tensor([1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], device='cuda:0')

In [28]:
outputs

(tensor(0.8237, device='cuda:0', grad_fn=<NllLossBackward>),
 tensor([[-2.3503,  2.3772],
         [-2.9966,  3.0601],
         [ 1.2628, -1.6083],
         [-0.2565,  0.2868],
         [-0.5624,  0.6006],
         [-1.2148,  1.2351],
         [-2.5976,  2.6902],
         [-2.8018,  2.8395],
         [-3.0181,  3.1259],
         [-2.6815,  2.6519],
         [-2.2761,  2.2943],
         [-3.1456,  3.1272],
         [-1.2787,  1.2625],
         [-2.7021,  2.7371],
         [ 1.3137, -1.5750],
         [ 2.9227, -3.4298]], device='cuda:0', grad_fn=<AddmmBackward>))