<a href="https://colab.research.google.com/github/okrazc/BERTCLass/blob/develop/BERTClassifMoviesGPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# In Google Colab or local Jupyter Notebook
!pip install transformers[torch] datasets torch -U
!pip install accelerate -U
!pip install tqdm
!pip install torch
!pip install datasets

Collecting transformers[torch]
  Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [

In [2]:
import os
import logging
import torch
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Training on GPU.")
else:
    device = torch.device("cpu")
    print("GPU is not available. Training on CPU.")

# Ensure the output directories exist (exist_ok makes the cell safe to re-run)
results_dir = './results'
logs_dir = './logs'
os.makedirs(results_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

# Configure logging.
# basicConfig() does nothing when the root logger already has handlers, which
# is the case in hosted notebooks such as Colab; INFO messages were then
# silently dropped. force=True (Python 3.8+) replaces the existing handlers so
# the requested level actually takes effect.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)
logger.info(f"Directories '{results_dir}' and '{logs_dir}' are created.")


GPU is available. Training on GPU.


In [5]:
from datasets import load_dataset
from transformers import BertTokenizer

# Fetch the IMDb dataset (the splits used below are 'train' and 'test')
dataset = load_dataset('imdb')

# Shuffle each split with a fixed seed, then keep only the leading rows:
# 5000 examples for training and 1000 for evaluation.
train_shuffled = dataset['train'].shuffle(seed=42)
train_data = train_shuffled.select(range(5000))

test_shuffled = dataset['test'].shuffle(seed=42)
test_data = test_shuffled.select(range(1000))

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(examples):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Every sequence is padded to a fixed maximum length and longer ones are
    truncated, so all encoded rows end up the same length.

    Args:
        examples: mapping with a 'text' entry; presumably a batch of review
            strings, since the function is applied with ``batched=True``.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    reviews = examples['text']
    encoded = tokenizer(reviews, truncation=True, padding='max_length')
    return encoded

# Training split: tokenize in batches, then expose only the tensors needed
# for training as torch tensors.
train_data = train_data.map(tokenize_data, batched=True)
train_data.set_format(columns=['input_ids', 'attention_mask', 'label'], type='torch')

# Evaluation split: same treatment.
test_data = test_data.map(tokenize_data, batched=True)
test_data.set_format(columns=['input_ids', 'attention_mask', 'label'], type='torch')


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
# Define the function to compute metrics
def compute_metrics(p):
    """Compute accuracy and weighted F1 from an evaluation result.

    Args:
        p: a two-element object that unpacks into (predictions, labels).
            The predictions are reduced with ``argmax(axis=1)``, so they are
            presumably per-class scores of shape (n_examples, n_classes).

    Returns:
        dict with keys 'accuracy' and 'f1'.
    """
    scores, labels = p
    predicted_classes = scores.argmax(axis=1)
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1_score(labels, predicted_classes, average='weighted'),
    }

# Load a pre-trained BERT model for sequence classification with 2 labels (binary classification)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Training configuration.
# - output_dir / logging_dir reuse the directories created in the setup cell
#   instead of repeating the path literals.
# - eval_strategy replaces the deprecated `evaluation_strategy` keyword
#   (renamed in transformers 4.41; the old name is rejected by newer releases,
#   which the unpinned install above will pull in).
training_args = TrainingArguments(
    output_dir=results_dir,            # Model checkpoints
    num_train_epochs=3,                # Number of passes over the training subset
    per_device_train_batch_size=16,    # Batch size for training
    per_device_eval_batch_size=32,     # Batch size for evaluation
    eval_strategy='epoch',             # Evaluate at the end of each epoch
    logging_dir=logs_dir,              # Training logs
    fp16=torch.cuda.is_available(),    # Mixed precision only when a GPU is present
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                      # The pre-trained BERT model
    args=training_args,               # Training arguments
    train_dataset=train_data,         # Training dataset
    eval_dataset=test_data,           # Evaluation dataset
    compute_metrics=compute_metrics   # Accuracy / F1 reported at each evaluation
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Start fine-tuning the model.
# The two log calls bracket the run; trainer.train() uses the model, datasets
# and TrainingArguments passed to the Trainer when it was constructed
# (3 epochs, with an evaluation pass at the end of each epoch).
logger.info("Starting training...")
trainer.train()
logger.info("Training complete.")


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.236753,0.909,0.90898
2,0.287000,0.283022,0.92,0.920009
3,0.287000,0.342489,0.919,0.919002


In [20]:

# Evaluate the fine-tuned model on the evaluation subset.
# The call is the cell's last expression, so the returned metrics dict is
# displayed; metric_key_prefix="eval" yields keys such as 'eval_loss',
# 'eval_accuracy' and 'eval_f1'.
trainer.evaluate(eval_dataset=test_data, metric_key_prefix="eval")




{'eval_loss': 0.3424893319606781,
 'eval_accuracy': 0.919,
 'eval_f1': 0.9190018639860487,
 'eval_runtime': 7.5138,
 'eval_samples_per_second': 133.088,
 'eval_steps_per_second': 4.259,
 'epoch': 3.0}

In [33]:
# Implement core functionality for prediction
def predict_sentiment(text, model, tokenizer, device):
    """Predict the class index for one text, or for each text in a batch.

    Args:
        text: a single string, or a sequence of strings.
        model: module whose forward pass accepts the tokenizer output as
            keyword arguments and returns an object with a ``logits`` tensor
            whose last dimension indexes the classes.
        tokenizer: callable producing the model inputs; its result must
            support ``.to(device)``.
        device: torch device on which to run inference.

    Returns:
        int class index when ``text`` is a string; otherwise a list with one
        class index per input text.
    """
    is_single = isinstance(text, str)

    model.to(device)
    # Run in eval mode so dropout is disabled and predictions are
    # deterministic; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
        # No gradients are needed for inference; skipping them saves memory.
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # argmax over the class dimension (softmax is monotonic, so applying it
    # first would not change the result). A plain argmax() would flatten the
    # batch and return a meaningless index for more than one text.
    predicted = logits.argmax(dim=-1)
    return predicted.item() if is_single else predicted.tolist()

# Example: classify one short review and print a readable label
# (class index 1 is reported as "Positive", anything else as "Negative").
text = "The movie was nonsense!"
sentiment = predict_sentiment(text, model, tokenizer, device)
label = "Positive" if sentiment == 1 else "Negative"
print(f'Sentiment: {label}')

Sentiment: Negative
