# Load Essential Libraries

In [1]:
import transformers
transformers.logging.set_verbosity_error()
%matplotlib inline

# Set up GPU for training

In [2]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: TITAN RTX


In [3]:
# import torch
# if torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print('mps supported')
# else:
#     print ("MPS device not found.")

# Load Data

In [4]:
import numpy as np
import pandas as pd

#read file
df = pd.read_csv('train.csv')
df1 = pd.read_csv('dev.csv')

df1 = df1.rename(columns={'labels': 'label'})

df = pd.concat([df, df1], ignore_index=True)

#select only text, tweet ids, sentiment label and sentiment agree columns
df = df[['text','label']]

X = df[['text']]
y = df['label']

In [5]:
X.shape

(8722, 1)

# Fine-tuning BERT

In [6]:
MAX_LEN = 512
LEARNING_RATE = 2e-05

In [7]:
def text_preprocessing(text):

    text = text

    return text

In [8]:
import torch
from transformers import AlbertTokenizer

# Load the BERT tokenizer
pretrained_bertmodel = 'albert-base-v2'  # Specify the ALBERT model you want to use
tokenizer = AlbertTokenizer.from_pretrained(pretrained_bertmodel)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(df):
    """Perform required preprocessing steps for pretrained BERT.
    @param    df (pd.DataFrame): DataFrame containing text1 and text2 columns to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every row in the dataframe...
    for index, row in df.iterrows():
        # `encode_plus` will:
        #    (1) Tokenize the sentences
        #    (2) Add the `[CLS]` and `[SEP]` tokens to the start and end
        #    (3) Truncate/Pad sentences to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention masks
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(row['text']),
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True
            )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [9]:
X_inputs, X_masks = preprocessing_for_bert(X)



In [10]:
y=torch.tensor(y)

# Train Model

## Create BertClassifier

In [11]:
import torch.nn as nn
from transformers import AlbertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Binary Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        super(BertClassifier, self).__init__()

        D_in, H, D_out = 768, 50, 1  # Change D_out to 1 for binary classification

        self.bert = AlbertModel.from_pretrained(pretrained_bertmodel)

        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        return logits.squeeze()  # Squeeze the output to remove the extra dimension

## Optimizer & Learning Rate Scheduler

In [12]:
from transformers import get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer, and the learning rate scheduler."""
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = torch.optim.AdamW(bert_classifier.parameters(), lr=LEARNING_RATE)

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

## Training Loop

In [13]:
import random
import time

# Specify loss function
loss_fn = nn.MSELoss()
threshold = 0.5
# loss_fn = nn.CrossEntropyLoss() # No adjust weight
# loss_fn = nn.CrossEntropyLoss(weight = torch.tensor(class_weights, dtype=torch.float)) # Adjust weight


def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Convert true labels to Float data type
            b_labels = b_labels.float()

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Compute logits
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss
            loss = loss_fn(logits.view(-1), b_labels.view(-1))

            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits.squeeze(), b_labels.float())
        val_loss.append(loss.item())

        # Get the predictions
        preds = (logits > threshold).long().flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

# BERT PREDICT

In [14]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader, threshold=0.5):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers care disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Apply sigmoid to convert logits to probabilities
        probs = logits.cpu()
#         probs = torch.sigmoid(logits).cpu().numpy()
        all_logits.append(probs)

    # Concatenate probabilities from each batch
    all_probs = np.concatenate(all_logits, axis=0)

    return all_probs

# # Compute predicted probabilities on the test set
# probs = bert_predict(bert_classifier, val_dataloader)


# KFOLD

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import os
import pandas as pd
import torch
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Create a directory to save the results
rootdir = "LRADAMWDROPOUT_alBERT_fold_results"
if not os.path.exists("%s" % rootdir):
    os.mkdir(rootdir)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

batch_size = 16# Loop over each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold}")
    fold_metrics = []
    probsandlab = [['Probability', 'Label']]
    # Create a directory for the current fold
    fold_dir = f"%s/fold_{fold}" % rootdir
    if not os.path.exists(fold_dir):
        os.mkdir(fold_dir)

    # Create the DataLoader for our training set
    train_data = TensorDataset(X_inputs[train_idx], X_masks[train_idx], y[train_idx])
    train_sampler = RandomSampler(train_data) # No adjust weight
    # train_sampler = train_subsampler # Adjusted weight
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create the DataLoader for our validation set
    val_data = TensorDataset(X_inputs[val_idx], X_masks[val_idx], y[val_idx])
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    # Create a directory to save the fold's epoch results
    epoch_results_dir = f"{fold_dir}/epoch_results"
    if not os.path.exists(epoch_results_dir):
        os.mkdir(epoch_results_dir)

    set_seed(42)    # Set seed for reproducibility
    bert_classifier, optimizer, scheduler = initialize_model(epochs=2)

    # Loop over epochs
    for epoch in range(1, 4):
        print(f"Epoch {epoch}")
        train(bert_classifier, train_dataloader, val_dataloader, epochs=1, evaluation=True)

        # Save the model
        model_path = f"{epoch_results_dir}/model_epoch_{epoch}.pt"
        torch.save(bert_classifier.state_dict(), model_path)

        # Compute predicted probabilities on the validation set
        val_probs = bert_predict(bert_classifier, val_dataloader)
        val_preds = (val_probs > 0.5).astype(int)
        val_labels = y[val_idx].detach().numpy()

        # Convert val_probs and val_labels to lists
        val_probs_list = val_probs.tolist()
        val_labels_list = val_labels.tolist()

        # Prepare the data for writing to CSV
        probsandlab.extend(list(zip(val_probs_list, val_labels_list)))

        # Create a DataFrame for the fold's metrics
        fold_metrics.append({
            'epoch': epoch,
            'predicted_probs': val_probs,
            'predicted_label': val_preds,
            'true_label': val_labels,
            'accuracy': accuracy_score(val_labels, val_preds),
            'precision': precision_score(val_labels, val_preds),
            'recall': recall_score(val_labels, val_preds),
            'f1': f1_score(val_labels, val_preds),
        })

        # Save the classification report to a text file
        report_path = f"{epoch_results_dir}/classification_report_epoch_{epoch}.txt"
        with open(report_path, 'w') as report_file:
            report_file.write(classification_report(val_labels, val_preds,digits=6))

    metricss_path = f"{fold_dir}/probsandlab_{fold}_bert.csv"
    probsandlab_df = pd.DataFrame(probsandlab[1:], columns=probsandlab[0])
    probsandlab_df.to_csv(metricss_path, index=False)

# Create a DataFrame for the fold's metrics
    fold_df = pd.DataFrame(fold_metrics)

    # Save the fold's metrics to a CSV file
    metrics_path = f"{fold_dir}/fold_{fold}_bert.csv"
    fold_df.to_csv(metrics_path, index=False)

    # Plot the validation accuracy over the number of epochs
    plt.plot(fold_df['epoch'], fold_df['accuracy'])
    plt.xlabel('Epoch')
    plt.ylabel('Validation Accuracy')
    plt.title(f'Fold {fold} - Validation Accuracy')
    plt.savefig(f"{fold_dir}/fold_{fold}_accuracy_plot.jpg")
    plt.close()

Fold 0
Epoch 1
Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.293651   |     -      |     -     |   11.44  
   1    |   40    |   0.231557   |     -      |     -     |   10.75  
   1    |   60    |   0.238886   |     -      |     -     |   10.80  
   1    |   80    |   0.257182   |     -      |     -     |   10.84  
   1    |   100   |   0.213266   |     -      |     -     |   10.90  
   1    |   120   |   0.201072   |     -      |     -     |   10.97  
   1    |   140   |   0.190723   |     -      |     -     |   10.92  
   1    |   160   |   0.207503   |     -      |     -     |   11.03  
   1    |   180   |   0.187047   |     -      |     -     |   11.07  
   1    |   200   |   0.200491   |     -      |     -     |   11.13  
   1    |   220   |   0.213125   |     -      |     -     |   11.16  
   1    |   240   |   0.173647   |     -      |     -  