In [1]:
!pip install datasets
!pip install transformers
import torch
import numpy as np
import pandas as pd
import copy
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, DistilBertModel, DistilBertTokenizer
from datasets import load_dataset
from tqdm import tqdm
from transformers import pipeline
import torch.nn.functional as F


Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [2]:
num_classes = 3
is_textual_metadata = True

class HybridBERTModel(nn.Module):
    def __init__(self, num_classes):
        super().__init__()

        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = torch.nn.Linear(774, 774)
        self.activation = torch.nn.Tanh()
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(774, num_classes)



    def forward(self, input_ids, attention_mask,  numerical_features, sentiment_score):
        distilbert_output = self.distilbert(input_ids = input_ids, attention_mask = attention_mask)
        hidden_state = distilbert_output[0]
        pooled_output = hidden_state[:, 0]
        combined_features = torch.cat((pooled_output, numerical_features, sentiment_score), dim = 1)
        pre_classifier_activated_output = self.activation((self.pre_classifier(combined_features)))
        dropout_output = self.dropout(pre_classifier_activated_output)
        logits = self.classifier(dropout_output)

        probs = F.softmax(logits, dim = 1)

        cum_probs = torch.cumsum(probs, dim = 1)

        return cum_probs

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
model = HybridBERTModel(num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

HybridBERTModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

In [5]:

dataset = load_dataset("liar")

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
sentiment_data = pd.read_csv("drive/MyDrive/sentiment_dataset.csv")
sentiment_mapping = dict(zip(sentiment_data['id'], sentiment_data['Sentiment']))

def preprocess_data(examples, num_classes, is_textual_metadata):

    def label_mapping(label_idx, num_classes):

      actual_idx = [1, 3, 4, 5, 2, 0]

      result = [float(0)] * num_classes

      i = (actual_idx[label_idx]) // (6 // num_classes)

      for idx in range(i, num_classes):
        result[idx] = float(1)

      return result


    labels = list(map(lambda elem: label_mapping(elem, num_classes), examples['label']))

    combined_text = [statement + " This is what " + speaker + ", a " + party_affiliation + ", said on " + subject + "."  for statement, speaker, subject, party_affiliation in zip(examples['statement'], examples['speaker'], examples['subject'], examples['party_affiliation'])]

    text_to_tokenize = combined_text if is_textual_metadata else examples['statement']

    tokenized = tokenizer(text_to_tokenize, padding= True, truncation=True, max_length=128, return_tensors = 'pt')

    sentiment_scores = [sentiment_mapping[statement_id] for statement_id in examples['id']]
    sentiment_scores = [[1 if s == 'POSITIVE' else 0] for s in sentiment_scores]

    numerical_features = torch.rand((len(examples['label']), 5)) # to exclude numerical features
    # numerical_features = torch.tensor([
    #     examples['barely_true_counts'],
    #     examples['false_counts'],
    #     examples['half_true_counts'],
    #     examples['mostly_true_counts'],
    #     examples['pants_on_fire_counts']
    # ]).t()

    return tokenized, numerical_features, sentiment_scores, labels

def create_dataset(dataset_split, num_classes, is_textual_metadata):
    tokenized,  numerical_features, sentiment_scores, labels = preprocess_data(dataset_split, num_classes, is_textual_metadata)
    dataset = TensorDataset(
        torch.tensor(tokenized['input_ids']),
        torch.tensor(tokenized['attention_mask']),
        torch.tensor(numerical_features),
        torch.tensor(sentiment_scores),
        torch.tensor(labels)
    )
    return dataset

train_dataset = create_dataset(dataset['train'], num_classes, is_textual_metadata)
val_dataset = create_dataset(dataset['validation'], num_classes, is_textual_metadata)
test_dataset = create_dataset(dataset['test'], num_classes, is_textual_metadata)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


optimizer = AdamW(model.parameters(), lr=5e-5)
# ['false', 'half-true', 'mostly-true', 'True', 'barely-true', 'pants-fire']


Downloading data:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/172k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1283 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  torch.tensor(tokenized['input_ids']),
  torch.tensor(tokenized['attention_mask']),
  torch.tensor(numerical_features),


In [6]:
class CustomBCELoss(nn.Module):
    def __init__(self):
        super(CustomBCELoss, self).__init__()

    def forward(self, outputs, targets):
        outputs = outputs.clamp(min= 0.0001, max= 0.9999)
        loss = -1 * (targets * torch.log(outputs) + (1 - targets) * torch.log(1 - outputs))
        return loss.mean()

loss_fn = CustomBCELoss()

In [7]:
best_val_accuracy = 0.0
best_model_state_dict = None

schedule = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma = 0.2)



for epoch in range(2):
    model.train()  # Set the model to train mode
    running_loss = 0
    # Training loop
    for batch in tqdm(train_loader):
        input_ids, attention_mask, numerical_features, sentiment_scores, labels = batch
        input_ids, attention_mask,  numerical_features, sentiment_scores, labels = input_ids.to(device), attention_mask.to(device), numerical_features.to(device), sentiment_scores.to(device), labels.to(device)

        outputs = model.forward(input_ids, attention_mask, numerical_features, sentiment_scores)

        optimizer.zero_grad()

        loss = loss_fn(outputs, labels)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")
    schedule.step()

    """
    # Validation loop
    model.eval()  # Set the model to evaluation mode
    total_val_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, numerical_features, sentiment_scores, labels = batch
            input_ids, attention_mask, numerical_features, sentiment_scores, labels = input_ids.to(device), attention_mask.to(device), numerical_features.to(device), sentiment_scores.to(device), labels.to(device)

            outputs = model.forward(input_ids, attention_mask, numerical_features, sentiment_scores)

            loss = nn.CrossEntropyLoss()(outputs, labels)
            total_val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch + 1}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}")

    # Save the model with the best validation accuracy
    if val_accuracy > best_val_accuracy:
        print("New best!")
        best_val_accuracy = val_accuracy
        # best_model_state_dict = copy.deepcopy(model.state_dict())
        best_model_state_dict = model.state_dict()
    # print(compare_state_dicts(best_model_state_dict, model.state_dict()))
    """




# Load the best model state_dict
#model.load_state_dict(best_model_state_dict)

100%|██████████| 642/642 [01:37<00:00,  6.57it/s]


Epoch 1, Loss: 0.40387439300709427


100%|██████████| 642/642 [01:42<00:00,  6.25it/s]

Epoch 2, Loss: 0.3541297197249077





In [None]:
"""
model_save_path = '/content/drive/MyDrive/news_bert_weights.pth'
torch.save(model.state_dict(), model_save_path)
"""

"\nmodel_save_path = '/content/drive/MyDrive/news_bert_weights.pth'\ntorch.save(model.state_dict(), model_save_path)\n"

In [None]:
#model.load_state_dict(torch.load('/content/drive/MyDrive/news_bert_weights.pth'))

In [8]:
import numpy as np

def get_probs(cums):
    result = [cums[0]]
    result.extend([cums[i] - cums[i - 1] for i in range(1, len(cums))])
    return result

def evaluate_model(model, testing_loader):
    model.eval()
    all_outputs, all_targets = [], []

    with torch.no_grad():
        for batch in testing_loader:
            input_ids, attention_mask, numerical_features, sentiment_scores, targets = batch
            input_ids, attention_mask, numerical_features, sentiment_scores, targets = input_ids.to(device), attention_mask.to(device), numerical_features.to(device), sentiment_scores.to(device), targets.to(device)

            outputs = model(input_ids, attention_mask, numerical_features, sentiment_scores)

            output_probs = [get_probs(output) for output in outputs]
            target_probs = [get_probs(target) for target in targets]

            output_vals = torch.argmax(torch.tensor(output_probs), dim = 1)
            target_vals = torch.argmax(torch.tensor(target_probs), dim = 1)
            all_outputs.append(output_vals)
            all_targets.append(target_vals)

    all_outputs = torch.cat(all_outputs, dim = 0)
    all_targets = torch.cat(all_targets, dim = 0)

    accuracy = (all_outputs == all_targets).float().mean().item()

    return all_outputs.float(), all_targets.float(), accuracy

def weighted_ordinal_accuracy(y_true, y_pred, weight=0.5):

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    abs_diff = np.abs(y_true - y_pred)
    mask = (abs_diff == 1) * weight
    weighted_diff = np.dot(abs_diff, mask)
    accuracy = (weighted_diff + np.sum(y_true == y_pred)) / len(y_true)
    return accuracy

def mean_absolute_error(y_true, y_pred):
    return torch.mean(torch.abs(y_true - y_pred)).item()

def within_one_accuracy(y_true, y_pred):
    return torch.mean((torch.abs(y_true - y_pred) <= 1).float()).item()



training_accuracy = evaluate_model(model, train_loader)[2]
y_true, y_pred, accuracy = evaluate_model(model, val_loader)
ordinal_accuracy = weighted_ordinal_accuracy(y_true, y_pred)
mean_absolute_error = mean_absolute_error(y_true, y_pred)
within_one_accuracy = within_one_accuracy(y_true, y_pred)

print("Training Accuracy", training_accuracy)
print("Validation Accuracy:", accuracy)
print('Validation Ordinal Accuracy:', ordinal_accuracy)
print('Validation Mean Absolute Error:', mean_absolute_error)
print('Within 1 Accuracy:', within_one_accuracy)

test_loader = DataLoader(test_dataset, batch_size = 16)
y_true, y_pred, test_accuracy = evaluate_model(model, test_loader)
test_ordinal_accuracy = weighted_ordinal_accuracy(y_true, y_pred)

print("Testing Accuracy:", test_accuracy)
print('Testing Ordinal Accuracy:', test_ordinal_accuracy)

Training Accuracy 0.6043431758880615
Validation Accuracy: 0.48520249128341675
Validation Ordinal Accuracy: 0.6787383177570093
Validation Mean Absolute Error: 0.6425233483314514
Within 1 Accuracy: 0.8722741603851318
Testing Accuracy: 0.47778645157814026
Testing Ordinal Accuracy: 0.6792673421667965
