### Install Dependincies

In [None]:
!pip install --upgrade transformers
!pip install --upgrade accelerate
!pip install torch torchvision torchaudio

In [None]:
!pip show transformers
!pip show accelerate
!pip show torchvision
!pip show torchaudio

Name: transformers
Version: 4.40.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Name: accelerate
Version: 0.30.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Name: torchvision
Version: 0.17.1+cu121
Summary: image and video datasets and models for torch deep le

### Downloading the Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Download the dataset
!gdown --id 1NdvIddoyYy2idsAWxJ8lodKfD-PZhmyL

Downloading...
From: https://drive.google.com/uc?id=1NdvIddoyYy2idsAWxJ8lodKfD-PZhmyL
To: /content/in_domain_train.tsv
100% 429k/429k [00:00<00:00, 145MB/s]


In [3]:
# Read into a pandas dataframe
df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])[['label', 'sentence']]
df.head()

Unnamed: 0,label,sentence
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


### Model Building, Training and Evaluation

In [4]:
import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [5]:
X = list(df.sentence)
y = list(df.label)

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [7]:
set(y)

{0, 1}

In [8]:
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(set(y)), problem_type="binary_classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Lets build custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
      text = str(self.texts[idx])
      label = torch.tensor(self.labels[idx])

      encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')
      # encoding = self.tokenizer(text, truncation=True, padding=True, return_tensors='pt')

      return {
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': label,
          # 'loss': torch.tensor(0.0),   # Placeholder for loss calculation
      }

In [10]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def binary_classification_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    y_pred = (probs.numpy() >= threshold).astype(int)
    y_true = labels

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, probs.numpy())

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }

    return metrics



# def compute_metrics(p:EvalPrediction):
#     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

#     if len(preds.shape) == 2 and preds.shape[1] > 1:
#         # Convert multilabel-indicator targets to single-label format
#         preds = np.argmax(preds, axis=1)
#     else:
#         preds = preds.squeeze()

#     result = binary_classification_metrics(predictions=preds,
#                                   labels=p.label_ids)

#     return result

In [26]:
import os

os.makedirs('/content/results', exist_ok=True)
os.makedirs('/content/logs', exist_ok=True)

In [45]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)  # Adjust patience as needed


# Define custom Trainer class with modified loss function
class TrainerWithCustomLoss(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# Define evaluation metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


# Define TrainingArguments
args = TrainingArguments(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    output_dir='/content/results',
    num_train_epochs=8,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_dir='/content/logs',
)


# Initialize Trainer
trainer = TrainerWithCustomLoss(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)


# Train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1000,0.0141,1.506919,0.79135,0.815063,0.909318,0.859615


TrainOutput(global_step=1712, training_loss=0.015927132602049924, metrics={'train_runtime': 563.8797, 'train_samples_per_second': 97.042, 'train_steps_per_second': 3.036, 'total_flos': 1812154013614080.0, 'train_loss': 0.015927132602049924, 'epoch': 8.0})

In [46]:
trainer.evaluate()

{'eval_loss': 1.5069186687469482,
 'eval_accuracy': 0.7913500876680304,
 'eval_precision': 0.8150633855331841,
 'eval_recall': 0.9093178036605657,
 'eval_f1': 0.8596146283916634,
 'eval_runtime': 6.1243,
 'eval_samples_per_second': 279.378,
 'eval_steps_per_second': 8.817,
 'epoch': 8.0}

In [43]:
trainer.save_model("distilbert-finetuned-binary-classifier")

In [44]:
!zip -r /content/model3.zip /content/distilbert-finetuned-binary-classifier

  adding: content/distilbert-finetuned-binary-classifier/ (stored 0%)
  adding: content/distilbert-finetuned-binary-classifier/config.json (deflated 46%)
  adding: content/distilbert-finetuned-binary-classifier/model.safetensors (deflated 8%)
  adding: content/distilbert-finetuned-binary-classifier/training_args.bin (deflated 51%)


In [47]:
text = "Carol Danvers gets her powers entangled with those of Kamala Khan and Monica Rambeau, forcing them to work together to save the universe."

encoding = tokenizer(text, return_tensors='pt')
encoding.to(trainer.model.device)

outputs = trainer.model(**encoding)

In [48]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-4.3721,  3.9403]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [49]:
import numpy as np

sigmoid = torch.nn.Sigmoid()
probs = sigmoid(outputs.logits[0].cpu())  # Assuming outputs is the output of the model

# Define the threshold (0.3 in your case)
threshold = 0.3
preds = np.zeros(probs.shape)
preds[np.where(probs >= threshold)] = 1

# Convert predicted labels to class names (assuming class 0 and class 1)
class_names = ['Class 0' if pred == 0 else 'Class 1' for pred in preds]

print(class_names)

['Class 0', 'Class 1']


In [50]:
preds.reshape(1,-1)

array([[0., 1.]])