## Running Document Classification BERT

In [1]:
import torch
import gc

# Delete the model, optimizer and any other large variables
try:
    del model
    del optimizer
except NameError:
    pass
torch.cuda.empty_cache()

# Manually trigger garbage collection
gc.collect()

0

In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

2023-12-10 15:48:12.680998: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Read and Split data

In [3]:
reviews = pd.read_csv('reviews_sample_stratified_preprocessed_1.csv')
reviews = reviews.dropna()
reviews['sentiment'] = reviews['sentiment'].map({'positive': 1, 'negative': 0})
reviews = reviews[['reviewText','sentiment']]
train, test = train_test_split(reviews, test_size=0.2, random_state=42)

-------

### Intialize the model

In [6]:
# Initialize the tokenizer and model from Hugging Face
# import distilled bert
id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

### data loader

In [7]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_token_len=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]

        encoding = self.tokenizer(
            row["reviewText"],
            max_length=self.max_token_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Assuming your sentiment is a single integer label
        label = torch.tensor(row['sentiment'])

        return dict(
            input_ids=encoding['input_ids'].flatten(),
            attention_mask=encoding['attention_mask'].flatten(),
            labels=label
        )

train_dataset = CustomDataset(train, tokenizer)
val_dataset = CustomDataset(test, tokenizer)


In [8]:
# DataLoaders for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

model = model.to('cuda' if torch.cuda.is_available() else exit())

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

#Epochs
nums_epoch = 50

from sklearn.metrics import accuracy_score

# Training loop
for epoch in range(nums_epoch):  # Set the number of epochs
    model.train() 
    total_train_loss = 0

    # Using tqdm for the progress bar
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{nums_epoch}", unit="batch"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Training Loss: {avg_train_loss}")

    # Validation loop
    model.eval()
    total_eval_accuracy = 0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())

    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Validation Accuracy: {avg_val_accuracy}")

Epoch 1/50: 100%|██████████| 2500/2500 [20:34<00:00,  2.02batch/s]


Training Loss: 0.2507483692318201
Validation Accuracy: 0.9233933333333333


Epoch 2/50: 100%|██████████| 2500/2500 [20:30<00:00,  2.03batch/s]


Training Loss: 0.14806643503159284
Validation Accuracy: 0.9237933333333332


Epoch 3/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.09267619093721732
Validation Accuracy: 0.9209933333333332


Epoch 4/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.05829818065944128
Validation Accuracy: 0.9269


Epoch 5/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.03918841185837518
Validation Accuracy: 0.9233


Epoch 6/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.02931382651287713
Validation Accuracy: 0.9246933333333333


Epoch 7/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.024952958113412024
Validation Accuracy: 0.9192933333333333


Epoch 8/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.02054100323774037
Validation Accuracy: 0.9214933333333333


Epoch 9/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.019715475462644825
Validation Accuracy: 0.9263933333333333


Epoch 10/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.01737245107320632
Validation Accuracy: 0.9230933333333332


Epoch 11/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.015308421369682764
Validation Accuracy: 0.9261


Epoch 12/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.014541490527516725
Validation Accuracy: 0.9206


Epoch 13/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.01369464606869733
Validation Accuracy: 0.9229933333333332


Epoch 14/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.0124646314116555
Validation Accuracy: 0.9271866666666667


Epoch 15/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.01161898533655185
Validation Accuracy: 0.9268866666666666


Epoch 16/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.010287173790900488
Validation Accuracy: 0.9233866666666667


Epoch 17/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.009877962694518646
Validation Accuracy: 0.9252866666666667


Epoch 18/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.01003939304575506
Validation Accuracy: 0.9277799999999999


Epoch 19/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.009947417823477371
Validation Accuracy: 0.9240866666666667


Epoch 20/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.009799671004540505
Validation Accuracy: 0.9225933333333333


Epoch 21/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.009147979067708912
Validation Accuracy: 0.9243933333333333


Epoch 22/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.00824640431496664
Validation Accuracy: 0.9262933333333333


Epoch 23/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.01018844948242495
Validation Accuracy: 0.9280866666666667


Epoch 24/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.008592240310766646
Validation Accuracy: 0.9240866666666667


Epoch 25/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.00914196500207654
Validation Accuracy: 0.9265933333333333


Epoch 26/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.007577343508754711
Validation Accuracy: 0.92438


Epoch 27/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.008759272717843486
Validation Accuracy: 0.9143866666666667


Epoch 28/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.00801936988495363
Validation Accuracy: 0.9234933333333333


Epoch 29/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.008233810360988355
Validation Accuracy: 0.9267933333333332


Epoch 30/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.00638351010272163
Validation Accuracy: 0.9258866666666666


Epoch 31/50: 100%|██████████| 2500/2500 [20:28<00:00,  2.03batch/s]


Training Loss: 0.0072351508305240715
Validation Accuracy: 0.9256933333333333


Epoch 32/50: 100%|██████████| 2500/2500 [20:29<00:00,  2.03batch/s]


Training Loss: 0.007799127114740258
Validation Accuracy: 0.9238933333333332


Epoch 33/50:   1%|          | 29/2500 [00:14<20:17,  2.03batch/s]

### Save model

In [None]:
   # Save the model after each epoch
model.save_pretrained("50epochsbertrealmodelcased")
tokenizer.save_pretrained("50epochsbertrealtokenizercased")

('20epochsbertrealtokenizer/tokenizer_config.json',
 '20epochsbertrealtokenizer/special_tokens_map.json',
 '20epochsbertrealtokenizer/vocab.txt',
 '20epochsbertrealtokenizer/added_tokens.json')