# Movie Genre Classification with BERT on Kaggle T4

This notebook implements movie genre classification using a BERT transformer model, optimized for the Kaggle T4 GPU environment.

In [1]:
# Install required dependencies for Kaggle T4 environment
!pip install transformers[torch] datasets tokenizers accelerate -q
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -q

# Check GPU availability
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU device name:", torch.cuda.get_device_name(0))
    print("GPU memory:", torch.cuda.get_device_properties(0).total_memory / 1024**3, "GB")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments



2025-06-29 13:11:18.814624: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751202679.026521      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751202679.091202      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
# Load training data (adjust path for Kaggle dataset)
# For Kaggle: Upload dataset and use: "/kaggle/input/your-dataset-name/train_data.txt"
train_file_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt"

print(f"Loading training data from: {train_file_path}")

# Rows are ":::"-separated; field 2 is used as the genre label and field 3 as the text
# (presumably ID ::: TITLE ::: GENRE ::: DESCRIPTION -- verify against the dataset).
lines = []    # description text of every valid row
target = []   # genre label of every valid row, aligned with `lines`
skipped_rows = 0

with open(train_file_path, "r", encoding='utf-8') as file:
    for raw_line in file:
        # maxsplit=3 keeps a description that itself contains ":::" in one piece.
        fields = raw_line.strip().split(":::", 3)
        # Safety check: a row without all four fields cannot supply both label and text.
        if len(fields) < 4:
            skipped_rows += 1
            continue
        target.append(fields[2].strip())
        lines.append(fields[3].strip())

if skipped_rows:
    print(f"Skipped {skipped_rows} malformed training rows")
    




Loading training data from: /kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt


In [13]:
# Learn the genre vocabulary from the training labels, then encode those labels as integer ids.
le = LabelEncoder().fit(target)
y = le.transform(target)

In [14]:
# Training frame: one row per sample, holding the text and its encoded genre.
df = pd.DataFrame(dict(text=lines, label=y))

In [15]:
# Wrap the training DataFrame in a `datasets.Dataset` so it can be tokenized and fed to the Trainer.
train_dataset = Dataset.from_pandas(df)


In [16]:
import os

# Load test data (adjust path for Kaggle dataset)
test_file_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt"
test_solution_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt"

print(f"Loading test data from: {test_file_path}")

# Test rows are ":::"-separated with the text in field 2
# (presumably ID ::: TITLE ::: DESCRIPTION -- verify against the dataset).
test_lines = []
with open(test_file_path, "r", encoding='utf-8') as file:
    for raw_line in file:
        raw_line = raw_line.strip()
        # maxsplit=2 keeps a description that itself contains ":::" in one piece.
        fields = raw_line.split(":::", 2)
        # A row with fewer than three fields is kept verbatim, so the row count
        # stays aligned with the solution file.
        test_lines.append(fields[2].strip() if len(fields) >= 3 else raw_line)

# Solution rows carry the genre label in field 2.
test_target = []
with open(test_solution_path, "r", encoding='utf-8') as file:
    for line_number, raw_line in enumerate(file, start=1):
        fields = raw_line.strip().split(":::")
        if len(fields) < 3:
            # Fail with a precise location instead of passing a raw line on as a label.
            raise ValueError(
                f"Malformed row {line_number} in {test_solution_path}: "
                "expected at least 3 ':::'-separated fields"
            )
        test_target.append(fields[2].strip())

# Texts and labels are paired by position, so the two files must have the same length.
if len(test_lines) != len(test_target):
    raise ValueError(
        f"Test data has {len(test_lines)} rows but the solution file has {len(test_target)}"
    )

# Reuse the encoder fitted on the training genres so label ids match across splits.
y_test = le.transform(test_target)
print(f"Loaded {len(test_lines)} test samples")


Loading test data from: /kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt
Loaded 54200 test samples


In [17]:
# Test frame: one row per sample, holding the text and its encoded genre.
test_df = pd.DataFrame(dict(text=test_lines, label=y_test))

In [18]:
# Wrap the test DataFrame in a `datasets.Dataset`, mirroring the training split.
test_dataset = Dataset.from_pandas(test_df)

In [19]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint (the same checkpoint name the model is loaded from).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [20]:
def tokenize(example):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Every sequence is truncated or padded to exactly 256 tokens.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=256)
    return tokenizer(example["text"], **encoding_options)


# Tokenize both splits in batched mode (train first, then test).
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/54214 [00:00<?, ? examples/s]

Map:   0%|          | 0/54200 [00:00<?, ? examples/s]

In [21]:
# Expose only the columns the model consumes, returned as PyTorch tensors.
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=model_columns)

# The classification head gets one output per genre known to the label encoder.
num_labels = len(le.classes_)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Training configuration tuned for a single Kaggle T4 GPU.
training_args = TrainingArguments(
    # --- outputs, checkpoints and logging ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="wandb",
    save_strategy="epoch",
    save_total_limit=2,              # keep only the two newest checkpoints to save disk
    # --- optimisation schedule ---
    num_train_epochs=3,              # short schedule to fit Kaggle time limits
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    # --- throughput ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,   # evaluation can use a larger batch
    fp16=True,                       # mixed precision
    dataloader_num_workers=2,
    # --- model selection ---
    # NOTE(review): load_best_model_at_end is not set, so these two presumably
    # have no effect on which weights are kept -- confirm against the library docs.
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    # Classes that are never predicted have undefined precision. zero_division=0
    # scores them as 0 (the same value the default produces) without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Bundle model, training configuration, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # NOTE(review): the test split doubles as the evaluation set here.
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)



In [27]:
import torch

# Query CUDA once and reuse the answer for both the report and the guard.
cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)
if cuda_ready:
    # Device 0 is the only GPU inspected here.
    print("GPU:", torch.cuda.get_device_name(0))

CUDA available: True
GPU: Tesla T4


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [43]:
import getpass
import os

import wandb

# Never hardcode credentials in a notebook: read the W&B API key from the
# WANDB_API_KEY environment variable, falling back to a hidden interactive prompt.
# NOTE(review): the key that was previously committed in this cell is exposed in
# the notebook history and should be revoked in the W&B account settings.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")
wandb.login(key=wandb_api_key)

wandb.init(project="bert-genre-classification", name="bert-run-1")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnihcas2015[0m ([33mnihcas2015-vellore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [44]:
# Fine-tune the model with the configuration held by `trainer`.
trainer.train()

# Score the trained model on the Trainer's eval_dataset; `eval_result` is a dict of metrics.
eval_result = trainer.evaluate()
print("Evaluation Results:", eval_result)

Step,Training Loss
50,2.3122
100,2.1406
150,1.9401
200,1.7362
250,1.6631
300,1.6714
350,1.5466
400,1.5269
450,1.4275
500,1.4182




Evaluation Results: {'eval_loss': 1.098167061805725, 'eval_accuracy': 0.6771771217712177, 'eval_precision': 0.6579181211779688, 'eval_recall': 0.6771771217712177, 'eval_f1': 0.6600026787573379, 'eval_runtime': 450.9146, 'eval_samples_per_second': 120.2, 'eval_steps_per_second': 1.878, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
# Memory optimization for Kaggle
import gc
import torch

# Clear cache before training
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"GPU memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    print(f"GPU memory cached: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

GPU memory allocated: 2.16 GB
GPU memory cached: 2.52 GB


In [45]:
# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded with `from_pretrained`.
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

print("Model saved to ./final_model")

# Generate predictions for submission
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)
# Map integer class ids back to the original genre strings.
predicted_genres = le.inverse_transform(predicted_labels)

# Create submission file (test_id is the 0-based row position in the test set)
submission_df = pd.DataFrame({
    'test_id': range(len(predicted_genres)),
    'predicted_genre': predicted_genres
})

submission_df.to_csv('submission.csv', index=False)
print(f"Submission file saved with {len(predicted_genres)} predictions")

# Display final metrics. They are taken from this prediction pass (keys are
# prefixed with "test_") rather than from a variable set in another cell, so the
# reported numbers always describe the predictions written to submission.csv.
test_metrics = predictions.metrics
print("\nFinal Model Performance:")
print(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}")
print(f"Test F1-Score: {test_metrics['test_f1']:.4f}")
print(f"Test Precision: {test_metrics['test_precision']:.4f}")
print(f"Test Recall: {test_metrics['test_recall']:.4f}")

Model saved to ./final_model


  _warn_prf(average, modifier, msg_start, len(result))


Submission file saved with 54200 predictions

Final Model Performance:
Test Accuracy: 0.6772
Test F1-Score: 0.6600
Test Precision: 0.6579
Test Recall: 0.6772
