In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Read the emotions CSV into a DataFrame.
data_path = '/content/drive/My Drive/emotions.csv'  # Update this path to your actual file location
df = pd.read_csv(data_path)

# Give every distinct value of the 'label' column an integer id, numbered in
# order of first appearance. The remapping is applied unconditionally, so
# labels that are already numeric are renumbered as well.
unique_labels = df['label'].unique()
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
df['label'] = df['label'].map(label_mapping)

# Wrap the DataFrame in a Hugging Face Dataset for the steps that follow.
dataset = Dataset.from_pandas(df)

# Tokenization: load the tokenizer from the same checkpoint name that is
# used for the model.
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-4_H-512_A-8')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# batched=True hands the function whole batches instead of single rows.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Hold out 10% of the tokenized examples for evaluation.
splits = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = splits['train']
test_dataset = splits['test']

# Load a smaller BERT variant with a classification head that has one output
# per entry in label_mapping.
model = BertForSequenceClassification.from_pretrained(
    'google/bert_uncased_L-4_H-512_A-8',
    num_labels=len(label_mapping),
)

# Define training arguments.
# The opening parenthesis of the call is required: without it the indented
# keyword lines below are a syntax error and the cell cannot run.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='/content/drive/My Drive/results',  # output location on the mounted Drive
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",  # evaluate and save on the same per-epoch schedule
    save_strategy="epoch",
)

# Collect the Trainer inputs in one place, then build the Trainer from them.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'tokenizer': tokenizer,
}
trainer = Trainer(**trainer_kwargs)

# Run training with the configured arguments.
trainer.train()

# Evaluate on the held-out split and show the returned metrics.
results = trainer.evaluate()
print(results)


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Map:   0%|          | 0/416809 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/116M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-512_A-8 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
