In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [None]:
# Fetch the Iris data bundled with scikit-learn. `X` holds the numeric
# measurements (four values per sample are used below) and `y` the integer
# class label of each sample.
iris = load_iris()
X = iris.data
y = iris.target

In [None]:
# Render every numeric sample as one English sentence so that a text model
# can consume it. The two adjacent f-strings are concatenated by the parser
# into a single template.
textual_features = [
    f"Sepal length is {x[0]}, sepal width is {x[1]}, "
    f"petal length is {x[2]}, petal width is {x[3]}"
    for x in X
]

In [None]:
# Hold out 20% of the sentences (and their labels) for evaluation. The fixed
# random_state keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    textual_features,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Encode both splits with the uncased BERT vocabulary. Each split is encoded
# in its own call with padding enabled and truncation disabled, so the padded
# width is determined per split.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings, test_encodings = (
    tokenizer(texts, truncation=False, padding=True)
    for texts in (X_train, X_test)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Peek at the first five training sentences to sanity-check the text template.
X_train[:5]

['Sepal length is 4.6, sepal width is 3.6, petal length is 1.0, petal width is 0.2',
 'Sepal length is 5.7, sepal width is 4.4, petal length is 1.5, petal width is 0.4',
 'Sepal length is 6.7, sepal width is 3.1, petal length is 4.4, petal width is 1.4',
 'Sepal length is 4.8, sepal width is 3.4, petal length is 1.6, petal width is 0.2',
 'Sepal length is 4.4, sepal width is 3.2, petal length is 1.3, petal width is 0.2']

In [None]:
# Show the token ids of the first five encoded training sentences, one
# sequence per output line.
for token_ids in train_encodings['input_ids'][:5]:
    print(token_ids)

[101, 19802, 2389, 3091, 2003, 1018, 1012, 1020, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1020, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1014, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1016, 102]
[101, 19802, 2389, 3091, 2003, 1019, 1012, 1021, 1010, 19802, 2389, 9381, 2003, 1018, 1012, 1018, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1019, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1018, 102]
[101, 19802, 2389, 3091, 2003, 1020, 1012, 1021, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1015, 1010, 9004, 2389, 3091, 2003, 1018, 1012, 1018, 1010, 9004, 2389, 9381, 2003, 1015, 1012, 1018, 102]
[101, 19802, 2389, 3091, 2003, 1018, 1012, 1022, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1018, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1020, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1016, 102]
[101, 19802, 2389, 3091, 2003, 1018, 1012, 1018, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1016, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1017, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1016, 102]


In [None]:
# Convert labels to tensors.
# The dtype is pinned to int64 (torch.long) instead of being inherited from
# the NumPy array: NumPy's default integer width is platform dependent, and
# the cross-entropy loss used for classification only accepts long targets.
train_labels = torch.tensor(y_train, dtype=torch.long)
test_labels = torch.tensor(y_test, dtype=torch.long)

In [None]:
# Display the raw training labels as returned by train_test_split.
y_train

array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
       1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
       1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1,
       0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
       1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 0, 1, 2, 0, 1, 2])

In [None]:
# Confirm that the label conversion produced a torch.Tensor.
type(train_labels)

torch.Tensor

In [None]:
def _encodings_to_dataset(encodings, labels):
    """Bundle tokenizer output and labels into a TensorDataset.

    Args:
        encodings: Tokenizer output exposing 'input_ids' and
            'attention_mask' entries.
        labels: Tensor of class labels, one per encoded sequence.

    Returns:
        A TensorDataset whose items are (input_ids, attention_mask, label).
    """
    token_ids = torch.tensor(encodings['input_ids'])
    mask = torch.tensor(encodings['attention_mask'])
    return TensorDataset(token_ids, mask, labels)


# Wrap each split so a DataLoader can batch ids, masks and labels together.
train_dataset = _encodings_to_dataset(train_encodings, train_labels)
test_dataset = _encodings_to_dataset(test_encodings, test_labels)

In [None]:
# Inspect the first three samples; slicing yields a tuple of
# (input_ids, attention_mask, labels) tensors.
train_dataset[:3]

(tensor([[  101, 19802,  2389,  3091,  2003,  1018,  1012,  1020,  1010, 19802,
           2389,  9381,  2003,  1017,  1012,  1020,  1010,  9004,  2389,  3091,
           2003,  1015,  1012,  1014,  1010,  9004,  2389,  9381,  2003,  1014,
           1012,  1016,   102],
         [  101, 19802,  2389,  3091,  2003,  1019,  1012,  1021,  1010, 19802,
           2389,  9381,  2003,  1018,  1012,  1018,  1010,  9004,  2389,  3091,
           2003,  1015,  1012,  1019,  1010,  9004,  2389,  9381,  2003,  1014,
           1012,  1018,   102],
         [  101, 19802,  2389,  3091,  2003,  1020,  1012,  1021,  1010, 19802,
           2389,  9381,  2003,  1017,  1012,  1015,  1010,  9004,  2389,  3091,
           2003,  1018,  1012,  1018,  1010,  9004,  2389,  9381,  2003,  1015,
           1012,  1018,   102]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
# Batch both splits in groups of 16. Only the training loader shuffles;
# the test loader keeps dataset order so predictions line up with y_test.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7c5539fb39d0>

In [None]:
# Instantiate pre-trained BERT with a sequence-classification head that has
# one output per Iris class (3). The head is not part of the checkpoint, so
# it starts from freshly initialised weights and must be fine-tuned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the optimizer (no learning-rate scheduler is used in this notebook).
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated upstream.
# eps and weight_decay are pinned to the defaults of the old class so the
# update rule stays as close as possible to the original behaviour.
# The learning rate is written as 1e-4, the same value as the former `10e-5`.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,
    eps=1e-6,
    weight_decay=0.0,
)



In [None]:
# Fine-tune the model: run on the GPU when one is available, otherwise on
# the CPU, and make `num_epochs` full passes over the training loader.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    model.train()  # training mode for every epoch
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # tensors to the compute device in one go.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()  # drop gradients left over from the last step
        # Supplying labels makes the model return the loss with its output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# End-to-end run: rebuild the loaders and the model from scratch, fine-tune
# for a few epochs, then report accuracy on the held-out split.

# Seed PyTorch before anything random happens (classifier-head
# initialisation, batch shuffling, dropout) so the reported accuracy can be
# reproduced. GPU kernels may still introduce small run-to-run differences.
torch.manual_seed(42)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 classes in Iris dataset

# Define optimizer (no learning-rate scheduler is used).
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are pinned to the old class's defaults to keep the update rule
# as close as possible to the original.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        # Supplying labels makes the model return the loss with its output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation loop
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        # Labels are not needed for inference, so only ids and mask are
        # moved to the compute device.
        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)  # highest-scoring class per sample
        test_preds.extend(preds.cpu().numpy())

# Calculate accuracy
test_preds = np.array(test_preds)
# Guard the element-wise comparison: mismatched shapes would not compare
# sample by sample and would silently produce a meaningless score.
assert test_preds.shape == np.shape(y_test), "prediction/label count mismatch"
test_accuracy = np.mean(test_preds == y_test)
print('Test Accuracy:', test_accuracy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.6333333333333333
