In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any

In [None]:
import tensorflow as tf
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import numpy as np

In [None]:
# Load the IMDB dataset through the `datasets` library.
# NOTE: despite the name, `df` is a DatasetDict keyed by split, not a DataFrame.
df = load_dataset(path='imdb')

In [None]:
# Display the DatasetDict summary: available splits, their features and row counts.
df

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Per-split (rows, columns) sizes.
df.shape

{'train': (25000, 2), 'test': (25000, 2), 'unsupervised': (50000, 2)}

In [None]:
# The IMDB train split is stored ordered by label, so taking the first 100 rows
# would yield examples from a single class and make every downstream metric
# meaningless. Shuffle with a fixed seed first, then keep 100 rows.
# `select` also avoids materialising the full 25k-row column just to slice it.
train_subset = df['train'].shuffle(seed=42).select(range(100))

texts = list(train_subset['text'])
labels = list(train_subset['label'])

In [None]:
# Confirm that the number of texts matches the number of labels.
len(texts), len(labels)

(100, 100)

In [None]:
# Step 1: keep 80% for training and hold out the remaining 20%.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Step 2: divide the held-out portion evenly into validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint
# (the same checkpoint name is used for the model further down).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def _encode(split_texts):
    """Tokenize a list of texts with truncation and padding enabled."""
    return tokenizer(split_texts, truncation=True, padding=True)


train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

In [None]:
def _to_tf_dataset(encodings, labels):
    """Build an unbatched tf.data.Dataset of (features, label) pairs.

    `features` is a dict holding the 'input_ids' and 'attention_mask' entries
    taken from `encodings`.
    """
    features = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Size the shuffle buffer to the whole training set: a buffer smaller than the
# dataset only mixes nearby examples instead of shuffling uniformly.
train_df = (
    _to_tf_dataset(train_encodings, train_labels)
    .shuffle(len(train_labels))
    .batch(1)
)
# Validation and test data keep their order; they are only batched.
val_df = _to_tf_dataset(val_encodings, val_labels).batch(1)
test_df = _to_tf_dataset(test_encodings, test_labels).batch(1)

In [None]:
# Pretrained BERT with a sequence-classification head on top.
# num_labels=2 is the library default, spelled out here for readability.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
# Sparse variant: targets are integer class ids rather than one-hot vectors.
# from_logits=True tells Keras the predictions are unnormalized scores, not probabilities.
loss_fn = SparseCategoricalCrossentropy(from_logits = True)

In [None]:
# Fine-tune with a small learning rate. Passing the string 'Adam' would build
# the optimizer with Keras' default rate of 1e-3, which is far too large for
# updating pretrained BERT weights and typically makes training diverge or
# collapse to a single predicted class.
bert_model.compile(
    loss=loss_fn,
    optimizer=Adam(learning_rate=2e-5),
    metrics=['accuracy'],
)

In [None]:
# Train for three epochs, evaluating on the validation set after each one.
history = bert_model.fit(
    train_df,
    validation_data=val_df,
    epochs=3,
)

In [None]:
# Score the fitted model on the held-out test batches and report both metrics.
test_loss, test_accuracy = bert_model.evaluate(test_df)

for caption, value in (
    ('Test Loss: ', test_loss),
    ('Test Accuracy: ', test_accuracy),
):
    print(caption, value)