In [None]:
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Read the train and test splits from CSV files in the working directory,
# then preview the first training rows and the shape of each split.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

print(train_df.head())
print(train_df.shape, test_df.shape)


                                                text  label
0  Scaramucci awaits U.S. approval for China deal...      1
1  China grants economic aid to Djibouti, site of...      1
2   Russia Probe’s New Leader Disqualified Himsel...      0
3  Airbus issues safety advice on Tiger helicopte...      1
4  Influential Shi'ite cleric Sadr says Americans...      1
(35911, 2) (7299, 2)


In [None]:
# Wrap each DataFrame in a Hugging Face Dataset so it can be mapped and formatted.
train_ds, test_ds = (Dataset.from_pandas(frame) for frame in (train_df, test_df))


In [None]:
# Fast tokenizer for the same "distilbert-base-uncased" checkpoint the model is loaded from.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` entries of a batch.

    Sequences are truncated to, and padded up to, a fixed length of 256 tokens.

    Args:
        batch: Mapping that provides the raw strings under the ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(batch["text"], **encode_options)

def _prepare_split(split):
    """Tokenize a split, rename ``label`` to ``labels`` and expose torch tensors.

    Only ``input_ids``, ``attention_mask`` and ``labels`` are returned in torch format.
    """
    prepared = split.map(tokenize, batched=True)
    prepared = prepared.rename_column("label", "labels")
    prepared.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return prepared


train_ds = _prepare_split(train_ds)
test_ds = _prepare_split(test_ds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/35911 [00:00<?, ? examples/s]

Map:   0%|          | 0/7299 [00:00<?, ? examples/s]

In [None]:
# DistilBERT checkpoint with a sequence-classification head configured for two labels.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics from an evaluation result.

    Args:
        eval_pred: Pair that unpacks to ``(logits, labels)``; the predicted
            class is the argmax of the logits along axis 1.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1``, where
        precision/recall/F1 use binary averaging.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(gold, predicted)
    # The fourth value returned (support) is not reported.
    prf = precision_recall_fscore_support(gold, predicted, average="binary")[:3]

    return dict(zip(("accuracy", "precision", "recall", "f1"), (accuracy, *prf)))


In [None]:
# Where checkpoints and logs are written; reporting integrations are disabled.
_io_options = dict(
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
)

# Evaluate and checkpoint once per epoch, and reload the checkpoint with the
# best F1 once training ends.
_schedule_options = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Optimisation hyperparameters.
_optim_options = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

training_args = TrainingArguments(**_io_options, **_schedule_options, **_optim_options)


In [None]:
# Assemble the Trainer from the model, training arguments, tokenized splits and
# the metric function.
#
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and emits a FutureWarning when
# the Trainer is constructed.
#
# NOTE(review): eval_dataset is the test split, and training_args enables
# load_best_model_at_end, so the final checkpoint is selected on the same data
# that is later reported as the test result. Consider holding out a validation
# split from train_ds for checkpoint selection and keeping test_ds for the
# final evaluation only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning; the value returned by train() is displayed as the cell output.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0046,0.010129,0.998767,0.998849,0.998561,0.998705
2,0.0005,0.010651,0.998767,0.998562,0.998849,0.998705


TrainOutput(global_step=4490, training_loss=0.006150784144356415, metrics={'train_runtime': 1744.8524, 'train_samples_per_second': 41.162, 'train_steps_per_second': 2.573, 'total_flos': 4757036753135616.0, 'train_loss': 0.006150784144356415, 'epoch': 2.0})

In [None]:
# Evaluate with the Trainer's configured eval_dataset and print the metrics dict.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.010650711134076118, 'eval_accuracy': 0.998766954377312, 'eval_precision': 0.9985615650172612, 'eval_recall': 0.9988489208633093, 'eval_f1': 0.998705222270177, 'eval_runtime': 57.5449, 'eval_samples_per_second': 126.84, 'eval_steps_per_second': 7.942, 'epoch': 2.0}


In [None]:
# Write the model first and then the tokenizer into the same directory, so the
# folder can be reloaded as one self-contained checkpoint.
_save_dir = "saved_model"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(_save_dir)

print(" DistilBERT model saved successfully")


 DistilBERT model saved successfully


In [None]:
# Shell escape: recursively archive the saved_model directory into saved_model.zip.
!zip -r saved_model.zip saved_model


  adding: saved_model/ (stored 0%)
  adding: saved_model/special_tokens_map.json (deflated 42%)
  adding: saved_model/tokenizer_config.json (deflated 75%)
  adding: saved_model/config.json (deflated 45%)
  adding: saved_model/vocab.txt (deflated 53%)
  adding: saved_model/model.safetensors (deflated 8%)
  adding: saved_model/tokenizer.json (deflated 71%)
