In [1]:
import os
import pandas as pd
from transformers import BertTokenizer
import torch

In [2]:
import transformers
import accelerate
import torch

print(transformers.__version__)
print(accelerate.__version__)
print(torch.__version__)

4.44.2
0.34.2
2.3.0


In [3]:
train_pos_folder = './data/aclImdb/train/pos'
train_neg_folder = './data/aclImdb/train/neg'

train_pos_sentences = [open(os.path.join(train_pos_folder, f)).read().strip() for f in os.listdir(train_pos_folder)]
train_neg_sentences = [open(os.path.join(train_neg_folder, f)).read().strip() for f in os.listdir(train_neg_folder)]

train_df = pd.DataFrame({
    'text': train_pos_sentences + train_neg_sentences,
    'label': [1] * len(train_pos_sentences) + [0] * len(train_neg_sentences)  # 1 for positive, 0 for negative
})

In [4]:
test_pos_folder = './data/aclImdb/test/pos'
test_neg_folder = './data/aclImdb/test/neg'

test_pos_sentences = [open(os.path.join(test_pos_folder, f)).read().strip() for f in os.listdir(test_pos_folder)]
test_neg_sentences = [open(os.path.join(test_neg_folder, f)).read().strip() for f in os.listdir(test_neg_folder)]

test_df = pd.DataFrame({
    'text': test_pos_sentences + test_neg_sentences,
    'label': [1] * len(test_pos_sentences) + [0] * len(test_neg_sentences)  # 1 for positive, 0 for negative
})

In [1]:
test_df

NameError: name 'test_df' is not defined

In [6]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the texts
train_text = train_df['text'].tolist()
test_text = test_df['text'].tolist()
train_encodings = tokenizer(train_text, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_text, truncation=True, padding=True, max_length=512)



In [7]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
# Create Dataset Instances
train_labels = train_df['label'].tolist()
test_labels = test_df['label'].tolist()

train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [9]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(train_labels)))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [11]:
# Start training
trainer.train()

  0%|          | 0/9375 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.4636, 'grad_norm': 9.789751052856445, 'learning_rate': 5e-05, 'epoch': 0.16}
{'loss': 0.3788, 'grad_norm': 7.839113235473633, 'learning_rate': 4.71830985915493e-05, 'epoch': 0.32}
{'loss': 0.3352, 'grad_norm': 16.014102935791016, 'learning_rate': 4.436619718309859e-05, 'epoch': 0.48}
{'loss': 0.327, 'grad_norm': 22.45017433166504, 'learning_rate': 4.154929577464789e-05, 'epoch': 0.64}
{'loss': 0.3042, 'grad_norm': 1.0483291149139404, 'learning_rate': 3.8732394366197184e-05, 'epoch': 0.8}
{'loss': 0.2792, 'grad_norm': 17.93540382385254, 'learning_rate': 3.5915492957746486e-05, 'epoch': 0.96}
{'loss': 0.23, 'grad_norm': 0.2011105865240097, 'learning_rate': 3.3098591549295775e-05, 'epoch': 1.12}
{'loss': 0.2088, 'grad_norm': 26.24764060974121, 'learning_rate': 3.028169014084507e-05, 'epoch': 1.28}
{'loss': 0.202, 'grad_norm': 24.78582191467285, 'learning_rate': 2.746478873239437e-05, 'epoch': 1.44}
{'loss': 0.1828, 'grad_norm': 6.840468406677246, 'learning_rate': 2.464788732394

TrainOutput(global_step=9375, training_loss=0.2052007303873698, metrics={'train_runtime': 6369.7128, 'train_samples_per_second': 11.774, 'train_steps_per_second': 1.472, 'total_flos': 1.9733329152e+16, 'train_loss': 0.2052007303873698, 'epoch': 3.0})

In [12]:
# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.3445657789707184, 'eval_runtime': 628.0659, 'eval_samples_per_second': 39.805, 'eval_steps_per_second': 2.489, 'epoch': 3.0}


In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get the predicted probabilities and true labels
preds = trainer.predict(test_dataset)
pred_probs = preds.predictions
labels = preds.label_ids

# Convert predicted probabilities to class labels (0 or 1)
pred_labels = (pred_probs[:, 1] > 0.5).astype(int)

# Compute accuracy
accuracy = accuracy_score(labels, pred_labels)

# Compute precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(labels, pred_labels, average='binary')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

  0%|          | 0/1563 [00:00<?, ?it/s]

Accuracy: 0.93708
Precision: 0.936556132640831
Recall: 0.93768
F1 Score: 0.9371177293623826
