# IMDB Experiment with Full Data

## Install the Transformers and Datasets Libraries

In [1]:
! pip install transformers datasets



## Load the Data

In [2]:
from datasets import load_dataset


In [3]:
# Load the "imdb" dataset through `datasets.load_dataset`; the result is indexed
# by split name later in the notebook ("train" / "test").
raw_dataset = load_dataset("imdb")

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]



## Load the Tokenizer for the Pretrained Model

In [4]:
from transformers import AutoTokenizer

In [5]:
# Load the tokenizer for the "bert-base-cased" checkpoint, the same checkpoint
# name the classification model is loaded from later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

## Use the `map` Method to Tokenize with Padding and Truncation

In [6]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``, so every encoded sequence is padded or cut to the
    same maximum length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text(s) to encode.

    Returns:
        The value returned by the global ``tokenizer`` for those texts.
    """
    encode_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples["text"], **encode_options)

In [7]:
# Apply `tokenize_function` to the dataset; `batched = True` hands the function
# groups of examples per call instead of one example at a time.
tokenized_datasets = raw_dataset.map(tokenize_function, batched = True)


Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-9ee40c0ee7fba4de.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-53cc57ece0ceae39.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-916d2fc1323341ac.arrow


## Split the Dataset into Train and Test

In [8]:
# Pull the two labelled splits out of the tokenized dataset by name.
train_dataset, test_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["test"],
)

In [9]:
# Report the number of examples in each split, one count per line
# (train first, then test).
print(len(train_dataset), len(test_dataset), sep="\n")

25000
25000


## Add a Classification Head on Top of the Pretrained Model

In [10]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Load "bert-base-cased" with a sequence-classification head. As the load
# message below reports, the `classifier` layer is not in the checkpoint and is
# newly initialized, so the model must be fine-tuned before it is used for
# predictions.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Check the dataset and keep only the relevant columns
  - Token IDs (`input_ids`)
  - Attention mask (`attention_mask`)
  - Token type IDs (`token_type_ids`)
  - Label (`label`)

In [11]:
# Drop the raw "text" column (only the tokenizer outputs and the label are used
# downstream) and switch each split to the "tensorflow" output format.
train_dataset_tf = train_dataset.remove_columns(["text"]).with_format("tensorflow")
# The evaluation set must come from `test_dataset`. Deriving it from
# `train_dataset` would make the validation metrics reported during training
# measure performance on the training data itself.
test_dataset_tf = test_dataset.remove_columns(["text"]).with_format("tensorflow")

## Convert Everything to Big Tensors
- use tf.data.Dataset.from_tensor_slices

In [12]:
# for x in tokenizer.model_input_names:
#   print(train_dataset_tf[x])

In [13]:
# Gather every column listed in the tokenizer's `model_input_names` into a dict
# keyed by column name.
train_features = {
    input_name: train_dataset_tf[input_name]
    for input_name in tokenizer.model_input_names
}
# Pair the feature dict with the "label" column so each element of the
# tf.data pipeline is a (features, label) tuple.
train_dataset_tf = tf.data.Dataset.from_tensor_slices(
    (train_features, train_dataset_tf["label"])
)
# Shuffle with a buffer as large as the dataset, then group into batches of 8.
train_dataset_tf = train_dataset_tf.shuffle(
    buffer_size=len(train_dataset_tf)
).batch(batch_size=8)

In [14]:
# Gather every column listed in the tokenizer's `model_input_names` into a dict
# keyed by column name.
test_features = {
    input_name: test_dataset_tf[input_name]
    for input_name in tokenizer.model_input_names
}
# Pair the feature dict with the "label" column so each element of the
# tf.data pipeline is a (features, label) tuple.
test_dataset_tf = tf.data.Dataset.from_tensor_slices(
    (test_features, test_dataset_tf["label"])
)
# Evaluation data is only batched (size 8); no shuffling is applied.
test_dataset_tf = test_dataset_tf.batch(batch_size=8)

## Compile

In [15]:
# Configure training: Adam with a 5e-5 learning rate, sparse categorical
# cross-entropy computed on logits (from_logits=True), and sparse categorical
# accuracy as the tracked metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

## Train

In [16]:
# Train for 3 epochs, evaluating on `test_dataset_tf` during training, and keep
# the object returned by `fit` in `model_history`.
model_history = model.fit(
    train_dataset_tf,
    validation_data=test_dataset_tf,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# epoch 3 is overfitting. 