In [None]:
from read_and_preprocess import read_reviews
from sklearn.model_selection import train_test_split
import numpy as np

# Read positive and negative reviews
pos_reviews = read_reviews('../data/pos')
neg_reviews = read_reviews('../data/neg')

# Merge them in a single dictionary (kept for any later cell that looks a
# review up by key). NOTE: dict.update() overwrites an entry whose key is
# already present, so if the same key occurs in both inputs this dictionary
# ends up with fewer entries than the two inputs combined.
all_reviews = {}
all_reviews.update(pos_reviews)
all_reviews.update(neg_reviews)

# Build the modelling data from the records of BOTH dictionaries instead of
# from the merged one, so that no review is silently dropped on a key
# collision. When the keys are disjoint this is exactly the same sequence as
# all_reviews.values() (positive records first, then negative ones).
review_records = [*pos_reviews.values(), *neg_reviews.values()]

# Splitting reviews and labels up
X = np.array([review['content'] for review in review_records])
y = np.array([review['label'] for review in review_records])

# Separate out the train, dev, and test sets.
# First hold out 15% of all reviews as the test set; then take 0.15 / 0.85 of
# the remaining 85% as the dev set, which is again 15% of the full data and
# leaves roughly 70% for training. Both splits are stratified on the labels
# and use a fixed random_state so the partition is repeatable.
X_train_dev, X_test, y_train_dev, y_test = train_test_split(X, y, test_size = 0.15, stratify = y, random_state = 31)
X_train, X_dev, y_train, y_dev = train_test_split(X_train_dev, y_train_dev, test_size = 0.15 / 0.85, stratify = y_train_dev, random_state = 31)

In [None]:
# Implementing BERT
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Tokenizer for the uncased base BERT checkpoint.
# To work with the cased variant, load 'bert-base-cased' here instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_reviews(contents):
    """Tokenize review texts for the BERT model.

    Args:
        contents: the review texts to encode (callers in this notebook pass
            a plain Python list).

    Returns:
        The result of calling the module-level ``tokenizer`` on ``contents``
        with padding and truncation enabled, a maximum length of 512, and
        PyTorch tensors requested via ``return_tensors='pt'``.
    """
    return tokenizer(
        contents,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

# Tokenize the train, dev, and test splits with identical settings.
# .tolist() converts each NumPy array to a plain Python list before it is
# handed to the tokenizer.
X_train_tokenized, X_dev_tokenized, X_test_tokenized = (
    preprocess_reviews(split.tolist()) for split in (X_train, X_dev, X_test)
)

# Dataset wrapper so the Trainer can fetch one tokenized review at a time
class ReviewsDataset(Dataset):
    """Pairs tokenizer encodings with their labels, one example per index.

    Args:
        encodings: mapping from field name (e.g. input_ids, attention_mask)
            to an indexable collection holding one entry per review.
        labels: indexable collection holding one label per review.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick entry `idx` out of every encoding field
        example = {}
        for field, values in self.encodings.items():
            example[field] = values[idx]
        # The target is stored under 'labels', the key the Trainer API expects
        example['labels'] = self.labels[idx]
        return example

# Create Datasets objects for the Trainer.
# Labels are cast to torch.long explicitly so the tensor dtype does not depend
# on how NumPy happened to store y (e.g. int32 or bool); class-index targets
# for a classification loss need to be int64.
# Assumes the labels are numeric class ids -- TODO confirm against read_reviews.
train_dataset = ReviewsDataset(X_train_tokenized, torch.tensor(y_train, dtype=torch.long))
dev_dataset = ReviewsDataset(X_dev_tokenized, torch.tensor(y_dev, dtype=torch.long))
test_dataset = ReviewsDataset(X_test_tokenized, torch.tensor(y_test, dtype=torch.long))

# Seed torch before the model is instantiated: the classification head that
# from_pretrained() puts on top of the pre-trained encoder is randomly
# initialised, and that happens here, before the Trainer is even created.
# Without this, each run starts from a different head even though the data
# splits are fixed. 31 matches the random_state used for the splits.
torch.manual_seed(31)

# Using the base BERT model (the checkpoint must match the tokenizer's)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased') # uncased
#model = BertForSequenceClassification.from_pretrained('bert-base-cased') # cased

# Defining arguments to provide to the Trainer
arguments = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1, # fraction of the training steps spent ramping the learning rate up to its peak
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True, # loads the best model at the end of training
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` -- check against the installed version.
    evaluation_strategy="epoch", # evaluates at the end of each epoch
    save_strategy="epoch",  # save at the end of each epoch (same cadence as evaluation)
)

# Creating the Trainer by loading model, arguments, and train and eval sets.
# The dev set is used for the per-epoch evaluation; the test set is not seen here.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

# Train the BERT model
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out test split.
# The trainer was configured with load_best_model_at_end=True, so after
# training it is expected to hold the checkpoint that did best on the dev set.
best_trainer = trainer
best_trainer.evaluate(eval_dataset=test_dataset)