In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

df = pd.read_csv('data_clean.csv')

# Make sure your text data does not contain NaN values
df['lemmatized-comment'].replace('', np.nan, inplace=True)
df.dropna(subset=['lemmatized-comment'], inplace=True)

# Split the data into training and validation datasets
train_text, temp_text, train_labels, temp_labels = train_test_split(df['lemmatized-comment'], df['numerical-label'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=df['numerical-label'])

# We'll further split the validation set into validation and test datasets
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)



In [2]:
from transformers import BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Load pre-trained model tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Encode the text
train_encodings = tokenizer(train_text.tolist(), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_text.tolist(), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_text.tolist(), truncation=True, padding=True, max_length=256)

# Convert inputs to PyTorch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
validation_inputs = torch.tensor(val_encodings['input_ids'])
train_labels = torch.tensor(train_labels.tolist())
validation_labels = torch.tensor(val_labels.tolist())
train_masks = torch.tensor(train_encodings['attention_mask'])
validation_masks = torch.tensor(val_encodings['attention_mask'])
test_inputs = torch.tensor(test_encodings['input_ids'])
test_labels = torch.tensor(test_labels.tolist())
test_masks = torch.tensor(test_encodings['attention_mask'])

# Create DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=16)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)



In [3]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np
from tqdm import tqdm, trange

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 16) # assuming you have 16 types of MBTI
model.to(device)

# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*3) # assuming you want 3 epochs

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 6

for _ in trange(epochs, desc="Epoch"):  
    # Training
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if step % 250 == 0:
            print("Batch: {0}/{1}, Loss: {2:.2f}".format(step, len(train_dataloader), loss.item()))

    avg_train_loss = total_loss / len(train_dataloader)            
    train_loss_set.append(avg_train_loss)
    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # Validation
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    torch.save(model.state_dict(), f'checkpoint2_{_}.pt') # Save the model checkpoint

# Save the final model
torch.save(model.state_dict(), 'bert_model2.pt')

def evaluate_test_set(test_dataloader):
    # Put model in evaluation mode to evaluate loss on the test set
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in test_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Test Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))

# Call the function to evaluate the test set
evaluate_test_set(test_dataloader)



cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Batch: 0/6757, Loss: 2.93
Batch: 250/6757, Loss: 2.38
Batch: 500/6757, Loss: 2.41
Batch: 750/6757, Loss: 2.54
Batch: 1000/6757, Loss: 2.31
Batch: 1250/6757, Loss: 2.37
Batch: 1500/6757, Loss: 2.37
Batch: 1750/6757, Loss: 2.27
Batch: 2000/6757, Loss: 2.73
Batch: 2250/6757, Loss: 2.39
Batch: 2500/6757, Loss: 2.45
Batch: 2750/6757, Loss: 2.81
Batch: 3000/6757, Loss: 2.33
Batch: 3250/6757, Loss: 2.19
Batch: 3500/6757, Loss: 2.31
Batch: 3750/6757, Loss: 2.11
Batch: 4000/6757, Loss: 1.82
Batch: 4250/6757, Loss: 2.19
Batch: 4500/6757, Loss: 2.65
Batch: 4750/6757, Loss: 2.31
Batch: 5000/6757, Loss: 2.35
Batch: 5250/6757, Loss: 2.46
Batch: 5500/6757, Loss: 2.20
Batch: 5750/6757, Loss: 2.03
Batch: 6000/6757, Loss: 2.21
Batch: 6250/6757, Loss: 2.03
Batch: 6500/6757, Loss: 2.20
Batch: 6750/6757, Loss: 2.14
Average training loss: 2.39
Validation Accuracy: 0.25


Epoch:  17%|█▋        | 1/6 [57:11<4:45:57, 3431.42s/it]

Batch: 0/6757, Loss: 2.63
Batch: 250/6757, Loss: 2.34
Batch: 500/6757, Loss: 2.17
Batch: 750/6757, Loss: 2.34
Batch: 1000/6757, Loss: 2.40
Batch: 1250/6757, Loss: 1.96
Batch: 1500/6757, Loss: 2.51
Batch: 1750/6757, Loss: 2.33
Batch: 2000/6757, Loss: 2.45
Batch: 2250/6757, Loss: 2.10
Batch: 2500/6757, Loss: 2.17
Batch: 2750/6757, Loss: 2.49
Batch: 3000/6757, Loss: 2.38
Batch: 3250/6757, Loss: 2.08
Batch: 3500/6757, Loss: 2.11
Batch: 3750/6757, Loss: 2.04
Batch: 4000/6757, Loss: 2.27
Batch: 4250/6757, Loss: 2.08
Batch: 4500/6757, Loss: 2.71
Batch: 4750/6757, Loss: 2.50
Batch: 5000/6757, Loss: 2.16
Batch: 5250/6757, Loss: 2.27
Batch: 5500/6757, Loss: 2.16
Batch: 5750/6757, Loss: 2.27
Batch: 6000/6757, Loss: 2.33
Batch: 6250/6757, Loss: 2.24
Batch: 6500/6757, Loss: 2.19
Batch: 6750/6757, Loss: 2.04
Average training loss: 2.23
Validation Accuracy: 0.27


Epoch:  33%|███▎      | 2/6 [1:54:28<3:48:58, 3434.62s/it]

Batch: 0/6757, Loss: 2.23
Batch: 250/6757, Loss: 2.16
Batch: 500/6757, Loss: 1.97
Batch: 750/6757, Loss: 2.19
Batch: 1000/6757, Loss: 1.97
Batch: 1250/6757, Loss: 2.27
Batch: 1500/6757, Loss: 2.14
Batch: 1750/6757, Loss: 2.03
Batch: 2000/6757, Loss: 2.29
Batch: 2250/6757, Loss: 2.05
Batch: 2500/6757, Loss: 2.07
Batch: 2750/6757, Loss: 1.90
Batch: 3000/6757, Loss: 2.79
Batch: 3250/6757, Loss: 2.26
Batch: 3500/6757, Loss: 2.01
Batch: 3750/6757, Loss: 2.09
Batch: 4000/6757, Loss: 1.95
Batch: 4250/6757, Loss: 2.18
Batch: 4500/6757, Loss: 2.15
Batch: 4750/6757, Loss: 2.00
Batch: 5000/6757, Loss: 2.31
Batch: 5250/6757, Loss: 2.07
Batch: 5500/6757, Loss: 2.36
Batch: 5750/6757, Loss: 2.00
Batch: 6000/6757, Loss: 2.05
Batch: 6250/6757, Loss: 2.16
Batch: 6500/6757, Loss: 1.93
Batch: 6750/6757, Loss: 1.82
Average training loss: 2.13
Validation Accuracy: 0.27


Epoch:  50%|█████     | 3/6 [2:51:45<2:51:47, 3435.81s/it]

Batch: 0/6757, Loss: 2.23
Batch: 250/6757, Loss: 2.17
Batch: 500/6757, Loss: 2.15
Batch: 750/6757, Loss: 2.19
Batch: 1000/6757, Loss: 1.95
Batch: 1250/6757, Loss: 1.94
Batch: 1500/6757, Loss: 2.13
Batch: 1750/6757, Loss: 2.37
Batch: 2000/6757, Loss: 2.03
Batch: 2250/6757, Loss: 2.40
Batch: 2500/6757, Loss: 2.02
Batch: 2750/6757, Loss: 2.01
Batch: 3000/6757, Loss: 2.50
Batch: 3250/6757, Loss: 2.14
Batch: 3500/6757, Loss: 2.18
Batch: 3750/6757, Loss: 2.66
Batch: 4000/6757, Loss: 2.23
Batch: 4250/6757, Loss: 1.91
Batch: 4500/6757, Loss: 2.35
Batch: 4750/6757, Loss: 2.67
Batch: 5000/6757, Loss: 2.14
Batch: 5250/6757, Loss: 1.81
Batch: 5500/6757, Loss: 2.17
Batch: 5750/6757, Loss: 2.33
Batch: 6000/6757, Loss: 1.87
Batch: 6250/6757, Loss: 2.26
Batch: 6500/6757, Loss: 2.18
Batch: 6750/6757, Loss: 2.31
Average training loss: 2.08
Validation Accuracy: 0.27


Epoch:  67%|██████▋   | 4/6 [3:49:07<1:54:36, 3438.07s/it]

Batch: 0/6757, Loss: 1.64
Batch: 250/6757, Loss: 2.26
Batch: 500/6757, Loss: 2.14
Batch: 750/6757, Loss: 2.46
Batch: 1000/6757, Loss: 2.18
Batch: 1250/6757, Loss: 2.19
Batch: 1500/6757, Loss: 1.86
Batch: 1750/6757, Loss: 2.09
Batch: 2000/6757, Loss: 2.32
Batch: 2250/6757, Loss: 2.10
Batch: 2500/6757, Loss: 1.90
Batch: 2750/6757, Loss: 1.90
Batch: 3000/6757, Loss: 2.18
Batch: 3250/6757, Loss: 2.13
Batch: 3500/6757, Loss: 2.16
Batch: 3750/6757, Loss: 1.79
Batch: 4000/6757, Loss: 1.88
Batch: 4250/6757, Loss: 1.86
Batch: 4500/6757, Loss: 2.14
Batch: 4750/6757, Loss: 2.37
Batch: 5000/6757, Loss: 1.89
Batch: 5250/6757, Loss: 1.94
Batch: 5500/6757, Loss: 2.02
Batch: 5750/6757, Loss: 1.93
Batch: 6000/6757, Loss: 2.28
Batch: 6250/6757, Loss: 2.10
Batch: 6500/6757, Loss: 1.93
Batch: 6750/6757, Loss: 2.42
Average training loss: 2.09
Validation Accuracy: 0.27


Epoch:  83%|████████▎ | 5/6 [4:46:44<57:25, 3445.04s/it]  

Batch: 0/6757, Loss: 1.72
Batch: 250/6757, Loss: 1.95
Batch: 500/6757, Loss: 1.91
Batch: 750/6757, Loss: 1.86
Batch: 1000/6757, Loss: 2.32
Batch: 1250/6757, Loss: 1.67
Batch: 1500/6757, Loss: 2.11
Batch: 1750/6757, Loss: 2.19
Batch: 2000/6757, Loss: 2.63
Batch: 2250/6757, Loss: 2.22
Batch: 2500/6757, Loss: 1.80
Batch: 2750/6757, Loss: 1.83
Batch: 3000/6757, Loss: 2.25
Batch: 3250/6757, Loss: 2.43
Batch: 3500/6757, Loss: 2.46
Batch: 3750/6757, Loss: 2.09
Batch: 4000/6757, Loss: 2.09
Batch: 4250/6757, Loss: 1.87
Batch: 4500/6757, Loss: 1.60
Batch: 4750/6757, Loss: 2.17
Batch: 5000/6757, Loss: 1.74
Batch: 5250/6757, Loss: 2.42
Batch: 5500/6757, Loss: 1.88
Batch: 5750/6757, Loss: 1.95
Batch: 6000/6757, Loss: 2.21
Batch: 6250/6757, Loss: 2.05
Batch: 6500/6757, Loss: 2.12
Batch: 6750/6757, Loss: 1.97
Average training loss: 2.09
Validation Accuracy: 0.27


Epoch: 100%|██████████| 6/6 [5:44:08<00:00, 3441.48s/it]


Test Accuracy: 0.27


: 