In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [None]:
# Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

In [None]:
# Convert numeric features to synthetic textual features
textual_features = [f"Sepal length is {x[0]}, sepal width is {x[1]}, petal length is {x[2]}, petal width is {x[3]}" for x in X]
# textual_features

In [None]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(textual_features, y, test_size=0.2, random_state=42)


In [None]:
# Tokenize textual features
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(X_train, truncation=False, padding=True)
test_encodings = tokenizer(X_test, truncation=False, padding=True)

In [None]:
X_train[:5]

['Sepal length is 4.6, sepal width is 3.6, petal length is 1.0, petal width is 0.2',
 'Sepal length is 5.7, sepal width is 4.4, petal length is 1.5, petal width is 0.4',
 'Sepal length is 6.7, sepal width is 3.1, petal length is 4.4, petal width is 1.4',
 'Sepal length is 4.8, sepal width is 3.4, petal length is 1.6, petal width is 0.2',
 'Sepal length is 4.4, sepal width is 3.2, petal length is 1.3, petal width is 0.2']

In [None]:
#train_encodings = tokenizer(['sepal','sepal length is 2'], truncation=False, padding=True)
#train_encodings['attention_mask']

In [None]:
for i in train_encodings['input_ids'][:5]:
  print(i)

[101, 19802, 2389, 3091, 2003, 1018, 1012, 1020, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1020, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1014, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1016, 102]
[101, 19802, 2389, 3091, 2003, 1019, 1012, 1021, 1010, 19802, 2389, 9381, 2003, 1018, 1012, 1018, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1019, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1018, 102]
[101, 19802, 2389, 3091, 2003, 1020, 1012, 1021, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1015, 1010, 9004, 2389, 3091, 2003, 1018, 1012, 1018, 1010, 9004, 2389, 9381, 2003, 1015, 1012, 1018, 102]
[101, 19802, 2389, 3091, 2003, 1018, 1012, 1022, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1018, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1020, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1016, 102]
[101, 19802, 2389, 3091, 2003, 1018, 1012, 1018, 1010, 19802, 2389, 9381, 2003, 1017, 1012, 1016, 1010, 9004, 2389, 3091, 2003, 1015, 1012, 1017, 1010, 9004, 2389, 9381, 2003, 1014, 1012, 1016, 102]


In [None]:
# Convert labels to tensors
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

In [None]:
y_train

array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
       1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
       1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1,
       0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
       1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 0, 1, 2, 0, 1, 2])

In [None]:
type(train_labels)

torch.Tensor

In [None]:
# Create datasets
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              train_labels)
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             test_labels)

In [None]:
train_dataset[:3]

(tensor([[  101, 19802,  2389,  3091,  2003,  1018,  1012,  1020,  1010, 19802,
           2389,  9381,  2003,  1017,  1012,  1020,  1010,  9004,  2389,  3091,
           2003,  1015,  1012,  1014,  1010,  9004,  2389,  9381,  2003,  1014,
           1012,  1016,   102],
         [  101, 19802,  2389,  3091,  2003,  1019,  1012,  1021,  1010, 19802,
           2389,  9381,  2003,  1018,  1012,  1018,  1010,  9004,  2389,  3091,
           2003,  1015,  1012,  1019,  1010,  9004,  2389,  9381,  2003,  1014,
           1012,  1018,   102],
         [  101, 19802,  2389,  3091,  2003,  1020,  1012,  1021,  1010, 19802,
           2389,  9381,  2003,  1017,  1012,  1015,  1010,  9004,  2389,  3091,
           2003,  1018,  1012,  1018,  1010,  9004,  2389,  9381,  2003,  1015,
           1012,  1018,   102]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7a0466b91cc0>

In [None]:
# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 classes in Iris dataset

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=10e-5)




In [None]:
# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 classes in Iris dataset

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation loop
model.eval()
with torch.no_grad():
    test_preds = []
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())

# Calculate accuracy
test_preds = np.array(test_preds)
test_accuracy = np.mean(test_preds == y_test)
print('Test Accuracy:', test_accuracy)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.7


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_dataset

# Load IMDb dataset from Hugging Face datasets library
dataset = load_dataset('imdb')

# Preprocess dataset
texts = dataset['train']['text'][:10000]
labels = dataset['train']['label'][:10000]

# Split dataset into train, validation, and test sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(val_texts, val_labels, test_size=0.5, random_state=42)

# Tokenize texts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

# Create datasets
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                               torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            val_labels)
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             test_labels)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation loop
model.eval()
with torch.no_grad():
    test_preds = []
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())

# Calculate accuracy
test_preds = np.array(test_preds)
test_accuracy = np.mean(test_preds == test_labels.numpy())
print('Test Accuracy:', test_accuracy)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

  torch.tensor(train_labels))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:


1000 - 600 (train)
     - 400 (test) - 200 (val)
                  - 200 (test)

IndentationError: unexpected indent (<ipython-input-3-585dd18517b1>, line 2)

In [None]:
pip install tensorflow==2.5


[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.5 (from versions: 2.8.0rc0, 2.8.0rc1, 2.8.0, 2.8.1, 2.8.2, 2.8.3, 2.8.4, 2.9.0rc0, 2.9.0rc1, 2.9.0rc2, 2.9.0, 2.9.1, 2.9.2, 2.9.3, 2.10.0rc0, 2.10.0rc1, 2.10.0rc2, 2.10.0rc3, 2.10.0, 2.10.1, 2.11.0rc0, 2.11.0rc1, 2.11.0rc2, 2.11.0, 2.11.1, 2.12.0rc0, 2.12.0rc1, 2.12.0, 2.12.1, 2.13.0rc0, 2.13.0rc1, 2.13.0rc2, 2.13.0, 2.13.1, 2.14.0rc0, 2.14.0rc1, 2.14.0, 2.14.1, 2.15.0rc0, 2.15.0rc1, 2.15.0, 2.15.0.post1, 2.15.1, 2.16.0rc0, 2.16.1)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.5[0m[31m
[0m

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_dataset

In [None]:


# Load IMDb dataset from Hugging Face datasets library
dataset = load_dataset('imdb')

# Preprocess dataset
texts = dataset['train']['text'][:100]
labels = dataset['train']['label'][:100]

# Split dataset into train, validation, and test sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(val_texts, val_labels, test_size=0.5, random_state=42)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Define TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    {key: train_encodings[key] for key in ['input_ids', 'attention_mask']},
    train_labels
)).shuffle(10).batch(1)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {key: val_encodings[key] for key in ['input_ids', 'attention_mask']},
    val_labels
)).batch(1)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {key: test_encodings[key] for key in ['input_ids', 'attention_mask']},
    test_labels
)).batch(1)

# Load pre-trained BERT model
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Define optimizer and loss function
optimizer = Adam(learning_rate=5e-5)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)

# Compile model
bert_model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
# bert_model.compile(optimizer='adam',
              # loss=tf.losses.BinaryCrossentropy(),
              # metrics=[tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

# Train model
history = bert_model.fit(train_dataset, validation_data=val_dataset, epochs=3)

# Evaluate model
test_loss, test_accuracy = bert_model.evaluate(test_dataset)
print('Test Accuracy:', test_accuracy)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Accuracy: 1.0


# New Section

In [None]:
import transformers

print(transformers.__version__)


In [None]:
pip install --upgrade tensorflow==2.15.1


Collecting tensorflow==2.15.1
  Downloading tensorflow-2.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.1)
  Downloading tensorboard-2.15.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
Collecting keras<2.16,>=2.15.0 (from tensorflow==2.15.1)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras, tensorboard, tensorflow
  Attempting uninstall: keras
    Found existing installation: keras 3.3.3
    Uninstalling keras-3.3.3:
      Successfully uninstalled keras-3.3.3
  Attempting uninstall: tensorboard
    Found existing installation: