# Load Essential Libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import transformers
transformers.logging.set_verbosity_error()
from transformers import BertTokenizer
from transformers import BertModel

from IPython.display import clear_output
import os
import numpy as np
import pandas as pd
import random
import json
from tqdm import tqdm
from sklearn import metrics
import time
from sklearn.model_selection import train_test_split

# Set up GPU for training

In [None]:
# Pick the compute device once; later cells move the model and batches to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cpu":
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

# Load Data

In [None]:
# Read the train and test splits from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('simple_CH_train.csv', 'simple_CH_test.csv')
)

# Data duplication (doubles the training set)

In [None]:
import pandas as pd

def incremental_concat(df):
    """Return `df` with a copy of all of its rows appended.

    The result has twice as many rows as the input and a fresh 0..2n-1 index.
    The input frame itself is left untouched.
    """
    doubled = pd.concat((df, df.copy()), ignore_index=True)
    return doubled

# Double the training set by appending a copy of every row.
# NOTE(review): the StratifiedKFold split later in this file runs on this
# doubled frame, so a row and its copy can land in different folds and the
# validation scores may be inflated — confirm this is intended.
df_train = incremental_concat(df_train)
df_train

In [None]:
# Renumber the index in place and discard the old one. incremental_concat
# already concatenates with ignore_index=True, so this is a safeguard.
df_train.reset_index(drop=True, inplace=True)
df_train

# organising materials

In [None]:
# Pull the labels out first, then narrow the frame to the three text columns
# that the tokenizer consumes.
y_train = df_train['label'].values
data_label = df_train["label"].values
df_train = df_train.loc[:, ['text1', 'text2', 'combined_text']]
df_train.shape

# Fine-tuning BERT

In [None]:
MAX_LEN = 280  # token length every encoded pair is padded to in preprocessing_for_bert
LEARNING_RATE = 2e-5  # learning rate handed to the optimizer in initialize_model

In [None]:
from transformers import BertTokenizer,BertTokenizerFast,AutoTokenizer, TFAutoModelForSequenceClassification, AutoModelForMaskedLM
# Load the BERT tokenizer for the "hfl/chinese-macbert-large" checkpoint
# (the same checkpoint name the encoder is loaded from further down).

tokenizer = BertTokenizer.from_pretrained("hfl/chinese-macbert-large")

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Tokenize sentence pairs for a pretrained BERT model.

    For every row, `text1` is encoded as the first segment and
    `text2 + " [SEP] " + combined_text` as the second segment. Each pair is
    truncated (longest segment first) and padded to exactly `MAX_LEN` tokens
    using the module-level `tokenizer`.

    @param    data (pd.DataFrame): Frame with `text1`, `text2` and
                  `combined_text` columns; values are cast to `str`.
    @return   input_ids (torch.Tensor): Token ids, shape (n_rows, MAX_LEN).
    @return   attention_masks (torch.Tensor): Mask of the tokens the model
                  should attend to, shape (n_rows, MAX_LEN).
    @return   token_type_ids (torch.Tensor): Segment ids, shape
                  (n_rows, MAX_LEN).
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []
    token_type_ids = []

    rows = zip(
        data['text1'].astype(str),
        data['text2'].astype(str),
        data['combined_text'].astype(str),
    )

    # For every sentence pair...
    for sent1, sent2, sent3 in rows:
        encoded_dict = tokenizer.encode_plus(
            sent1, sent2 + " [SEP] " + sent3,
            add_special_tokens=True,
            max_length=MAX_LEN,
            # `truncation_strategy=` is the deprecated spelling; `truncation=`
            # is the documented argument. Without truncation, rows longer
            # than MAX_LEN would break torch.stack below.
            truncation='longest_first',
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension of size 1; drop
        # only that one so the sequence dimension is never squeezed away.
        input_ids.append(encoded_dict['input_ids'].squeeze(0))
        attention_masks.append(encoded_dict['attention_mask'].squeeze(0))
        token_type_ids.append(encoded_dict['token_type_ids'].squeeze(0))

    # torch.stack cannot stack an empty list; return correctly shaped empties.
    if not input_ids:
        empty = torch.empty((0, MAX_LEN), dtype=torch.long)
        return empty, empty.clone(), empty.clone()

    input_ids = torch.stack(input_ids, dim=0)
    attention_masks = torch.stack(attention_masks, dim=0)
    token_type_ids = torch.stack(token_type_ids, dim=0)

    return input_ids, attention_masks, token_type_ids

# Encode the whole training frame once; the per-fold datasets built later
# index into these tensors instead of re-tokenizing.
train_inputs, train_masks,train_token_type_ids = preprocessing_for_bert(df_train)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Batch size used by the loader built in this cell.
batch_size = 16

# Labels as a tensor so they can sit in the same TensorDataset as the encodings.
train_labels = torch.tensor(y_train)

# Loader over the full training set; a RandomSampler picks the batch order.
train_data = TensorDataset(
    train_inputs,
    train_masks,
    train_token_type_ids,
    train_labels,
)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Train Model

## Create BertClassifier

In [None]:
%%time
from transformers import BertModel, AutoModel,BertForSequenceClassification, AutoModelForTokenClassification, BertConfig

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """BERT encoder followed by a two-layer feed-forward classification head.
    """
    def __init__(self, bert_model, freeze_bert=False, hidden_dim=None, num_labels=None):
        """
        @param    bert_model: encoder to wrap. It must expose
                      `config.hidden_size` and be callable with `input_ids`,
                      `attention_mask` and `token_type_ids` keyword arguments.
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model;
                      `True` disables gradients for every encoder parameter.
        @param    hidden_dim (int or None): Width of the head's hidden layer.
                      `None` falls back to the module-level constant `H`.
        @param    num_labels (int or None): Number of output logits. `None`
                      falls back to the module-level constant `D_out`.
        """
        super().__init__()

        # Fall back to the notebook-level constants so that existing
        # `BertClassifier(bert_model)` calls keep working unchanged.
        if hidden_dim is None:
            hidden_dim = H
        if num_labels is None:
            num_labels = D_out

        self.bert = bert_model
        self.classifier = nn.Sequential(
            nn.Linear(bert_model.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_labels)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @param    token_type_ids (torch.Tensor): segment ids, passed through to
                      the encoder unchanged
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        # The first output is the last hidden state; position 0 of the sequence
        # is the `[CLS]` token, whose vector is used for classification.
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits
H = 24  # Width of the classifier head's hidden layer (read by BertClassifier.__init__)
D_out = 3  # Number of logits the classifier head outputs
bert_model = BertModel.from_pretrained("hfl/chinese-macbert-large")  # encoder wrapped by initialize_model

## Optimizer & Learning Rate Scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
Epochs = 4
def initialize_model(epochs=4):
    """Build a fresh classifier together with its optimizer and LR scheduler.

    Reads the module-level `bert_model`, `device`, `LEARNING_RATE` and
    `train_dataloader` at call time.

    @param    epochs (int): Number of epochs the schedule should span.
    @return   (bert_classifier, optimizer, scheduler)
    """
    # Wrap the module-level encoder and move the whole model to the target device
    model = BertClassifier(bert_model)
    model.to(device)

    # Plain Adam over every parameter of the model
    adam = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # The scheduler is stepped once per batch, so it spans batches * epochs steps
    steps_per_epoch = len(train_dataloader)
    lr_schedule = get_linear_schedule_with_warmup(
        adam,
        num_warmup_steps=0,  # Default value
        num_training_steps=steps_per_epoch * epochs,
    )
    return model, adam, lr_schedule

## Training Loop

In [None]:
import matplotlib.pyplot as plt

def plot_loss(train_loss, val_loss):
    """Draw the training and validation loss curves on one set of axes and show it.

    Each argument is a sequence of loss values; the x axis is the position in
    the sequence.
    """
    _, axes = plt.subplots()

    # One line per series, in the same order as the legend entries
    curves = ((train_loss, 'Train Loss'), (val_loss, 'Val Loss'))
    for series, legend_name in curves:
        axes.plot(series, label=legend_name)

    # Axis labels and title
    axes.set_xlabel('Epochs')
    axes.set_ylabel('Loss')
    axes.set_title('Training and Validation Loss')

    axes.legend()
    plt.show()

In [None]:
# Specify loss function: unweighted cross-entropy, read as a global by both
# train() and evaluate().
loss_fn = nn.CrossEntropyLoss() # No adjust weight
from sklearn.metrics import f1_score
# Class-weighted alternative, kept for reference (needs a `class_weights` array):
# loss_fn = nn.CrossEntropyLoss(weight = torch.tensor(class_weights, dtype=torch.float)) # Adjust weight


def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device)
    with the same value for reproducibility.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=Epochs, evaluation=False, save_best_model = True):
    """Train the BertClassifier model.

    Relies on the module-level `optimizer`, `scheduler`, `loss_fn` and `device`
    (set up by `initialize_model` and the earlier cells); they must exist
    before this function is called.

    @param    model (nn.Module): Called as
                  `model(input_ids, attention_mask, token_type_ids)`; must
                  return logits.
    @param    train_dataloader: Yields
                  (input_ids, attention_mask, token_type_ids, labels) batches.
    @param    val_dataloader: Same batch layout; only read when `evaluation`
                  is true.
    @param    epochs (int): Number of passes over `train_dataloader`.
    @param    evaluation (bool): Run `evaluate` on `val_dataloader` after
                  every epoch.
    @param    save_best_model (bool): When evaluating, write the weights to
                  ./final_github/best{epoch}_model.pt after any epoch that
                  improves the best validation loss or the best macro-F1.
    @return   (best_val_loss, best_macro_f1): Best values seen over all
                  epochs. Without evaluation they keep their initial values
                  `inf` and `0.0`.
    """
    train_losses = []
    val_losses = []
    best_macro_f1 = 0.0
    best_val_loss = float('inf')

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table. The macro-F1 column is 12 wide:
        # the previous `^9.2` spec applied a precision of 2 to a string, which
        # cut the heading down to "Va".
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Macro-F1':^12} | {'Elapsed':^9}")

        print("-"*80)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask, b_token_type_ids)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^12} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data. It is
        # recorded exactly once per epoch so the train curve lines up with the
        # validation curve in plot_loss.
        avg_train_loss = total_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)
        print("-"*80)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, macro_f1 = evaluate(model, val_dataloader)

            # Time for the whole epoch, including evaluation
            time_elapsed = time.time() - t0_epoch

            improved_loss = val_loss < best_val_loss
            improved_f1 = macro_f1 > best_macro_f1
            if improved_loss:
                best_val_loss = val_loss
            if improved_f1:
                best_macro_f1 = macro_f1

            # Both criteria target the same file, so one write per epoch is enough.
            if save_best_model and (improved_loss or improved_f1):
                # The folder may not exist yet when training runs first.
                os.makedirs("./final_github", exist_ok=True)
                torch.save(model.state_dict(), f"./final_github/best{epoch_i}_model.pt")

            val_losses.append(val_loss)

            print(f"{epoch_i + 1:^7} | {step:^7} | {avg_train_loss:^12f} | {val_loss:^10f} | {macro_f1:^12.2f} | {time_elapsed:^9f}")
            print("-" * 80)

        print("\n")

    print("Training complete!")
    plot_loss(train_losses, val_losses)
    return best_val_loss, best_macro_f1

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.

    Reads the module-level `device`, `loss_fn` and `f1_score`.

    @param    model (nn.Module): Called as
                  `model(input_ids, attention_mask, token_type_ids)`.
    @param    val_dataloader: Yields
                  (input_ids, attention_mask, token_type_ids, labels) batches.
    @return   val_loss (float): Mean of the per-batch losses.
    @return   macro_f1 (float): Macro-averaged F1 computed once over all
                  validation predictions.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    batch_losses = []
    val_predictions = []
    val_labels = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits and loss without building a graph
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask, b_token_type_ids)
            loss = loss_fn(logits, b_labels)
        batch_losses.append(loss.item())

        # Collect predictions and labels for the set-level F1 below
        preds = torch.argmax(logits, dim=1).flatten()
        val_predictions.extend(preds.cpu().numpy())
        val_labels.extend(b_labels.cpu().numpy())

    # Compute the average loss over the validation set
    val_loss = np.mean(batch_losses)

    # Macro-F1 is not additive over batches, so compute it once on the full
    # set rather than averaging running per-batch scores.
    macro_f1 = f1_score(val_labels, val_predictions, average='macro')

    return val_loss, macro_f1

In [None]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import os
batch_size = 16
Epoch = 5

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

best_val_loss = float('inf')
best_macro_f1 = 0.0
best_model = None

# train() and the final save below both write into this folder, which nothing
# has created yet at this point in the notebook.
os.makedirs("./final_github", exist_ok=True)

# Loop over each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, train_labels)):
    # Reload the pretrained encoder so every fold starts from the same weights;
    # initialize_model reads this module-level `bert_model`.
    bert_model = BertModel.from_pretrained("hfl/chinese-macbert-large")
    print(f"Fold {fold}")
    fold_performance = []
    fold_metrics = []
    # Create the DataLoader for our training set
    train_data = TensorDataset(train_inputs[train_idx], train_masks[train_idx], train_token_type_ids[train_idx],
                               train_labels[train_idx])
    train_sampler = RandomSampler(train_data)  # No adjust weight
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create the DataLoader for our validation set
    val_data = TensorDataset(train_inputs[val_idx], train_masks[val_idx], train_token_type_ids[val_idx],
                             train_labels[val_idx])
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    # (A bare `%time` line magic used to sit here; it timed an empty statement.)
    set_seed(42)  # Set seed for reproducibility
    # `optimizer` and `scheduler` must be module-level names: train() reads them as globals.
    bert_classifier, optimizer, scheduler = initialize_model(epochs=Epoch)
    val_loss, macro_f1 = train(bert_classifier, train_dataloader, val_dataloader, epochs=Epoch, evaluation=True, save_best_model = True)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() returns references to the live parameters, so take a CPU
        # copy. Otherwise this fold's model stays pinned on the device while
        # later folds train.
        # NOTE(review): these are the weights after the last epoch, while
        # `val_loss` is the best value over all epochs — confirm which
        # checkpoint is wanted.
        best_model = {
            name: tensor.detach().cpu().clone()
            for name, tensor in bert_classifier.state_dict().items()
        }

    # Track the best macro-F1 seen in any fold
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1

# Save the best model
if best_model is not None:
    torch.save(best_model, f"./final_github/best_model.pt")

# Predict

In [None]:
def bert_predict(model, test_dataloader):
    """Run the trained model over a dataloader and return class probabilities.

    Only the first three tensors of every batch (input ids, attention mask,
    token type ids) are fed to the model; anything after them is ignored.
    Returns a NumPy array with one row of softmax probabilities per example.
    """
    model.eval()

    collected = []
    for batch in test_dataloader:
        # Move the batch to the target device, then keep the model inputs
        on_device = tuple(t.to(device) for t in batch)
        ids, mask, segments = on_device[:3]

        # Inference only: no gradient tracking
        with torch.no_grad():
            batch_logits = model(ids, mask, segments)
        collected.append(batch_logits)

    # One tensor for the whole set, then softmax across the class dimension
    stacked = torch.cat(collected, dim=0)
    return F.softmax(stacked, dim=1).cpu().numpy()

In [None]:
# Labels of the held-out test split (two names for the same array).
y_test = test_label = df_test["label"].values

# Encode the three text columns exactly like the training data.
test_text_df = df_test[['text1','text2','combined_text']]
test_inputs, test_masks,test_token_type_ids = preprocessing_for_bert(test_text_df)

# Sequential loader, so predictions come back in row order.
test_data = TensorDataset(test_inputs, test_masks, test_token_type_ids)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    sampler=SequentialSampler(test_data),
)

In [None]:
# Compute predicted probabilities on the test set
all_prob = []
all_answer = []

label_folder_name = './final_github'

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(label_folder_name, exist_ok=True)

# Reuse the last classifier object as a container and overwrite its weights
# with the saved best checkpoint.
best_model = bert_classifier
# map_location lets a checkpoint saved on one device load on whatever
# `device` is in the current session (e.g. CPU-only).
best_model.load_state_dict(torch.load(f"./final_github/best_model.pt", map_location=device))
best_model.to(device)

# Compute predicted probabilities on the test set
probs = bert_predict(best_model, test_dataloader)

all_prob.extend(probs)

# Predicted class = index of the largest probability in each row
val_preds = np.argmax(probs, axis=1)

fold_metrics = {
    'predicted_probs 0': [lst[0] for lst in probs],
    'predicted_probs 1': [lst[1] for lst in probs],
    'predicted_probs 2': [lst[2] for lst in probs],
    'predicted_label': val_preds,
    'true_label': y_test
}

fold_df = pd.DataFrame(fold_metrics, columns=['predicted_probs 0', 'predicted_probs 1',
                                              'predicted_probs 2', 'predicted_label', 'true_label'])

# Write probabilities, predicted label and true label per test row to CSV
label_file_path = os.path.join(label_folder_name, f'./label.csv')
fold_df.to_csv(label_file_path, index=False)

In [None]:
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix, accuracy_score
from sklearn import metrics

def calculate_metrics(y_true, y_pred):
    """Compute, print and return classification metrics.

    Accuracy, micro/macro F1, macro precision and macro recall come from
    scikit-learn. Specificity, NPV and PPV are derived from the confusion
    matrix:

    * 2x2 matrix: class 1 is the positive class (unchanged behavior).
    * any other size: each class is scored one-vs-rest and the per-class
      values are macro-averaged; a class with a zero denominator counts as 0.

    @param    y_true: True labels.
    @param    y_pred: Predicted labels, same length as `y_true`.
    @return   dict with keys `report`, `confusion_matrix`, `f1_score_micro`,
                  `f1_score_macro`, `precision`, `recall`, `accuracy`,
                  `specificity`, `NPV`, `PPV`.
    """
    def _macro_rate(numerator, denominator):
        # Per-class ratio averaged over classes; 0 where the denominator is 0.
        numerator = numerator.astype(float)
        denominator = denominator.astype(float)
        rates = np.divide(numerator, denominator,
                          out=np.zeros_like(numerator), where=denominator > 0)
        return float(rates.mean())

    report = classification_report(y_true, y_pred, digits=4)
    matrix = confusion_matrix(y_true, y_pred)
    f1_score_micro = metrics.f1_score(y_true, y_pred, average='micro')
    f1_score_macro = metrics.f1_score(y_true, y_pred, average='macro')
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)

    if matrix.shape == (2, 2):
        tn = matrix[0, 0]
        tp = matrix[1, 1]
        fp = matrix[0, 1]
        fn = matrix[1, 0]
        specificity = tn / (fp + tn)
        NPV = tn / (fn + tn)
        PPV = tp / (fp + tp)
    else:
        # Reading only matrix[0:2, 0:2] would ignore every class beyond the
        # first two, so derive one-vs-rest counts for each class instead.
        tp = np.diag(matrix)
        fp = matrix.sum(axis=0) - tp
        fn = matrix.sum(axis=1) - tp
        tn = matrix.sum() - tp - fp - fn
        specificity = _macro_rate(tn, tn + fp)
        NPV = _macro_rate(tn, tn + fn)
        PPV = _macro_rate(tp, tp + fp)

    results = {
        'report': report,
        'confusion_matrix': matrix,
        'f1_score_micro': f1_score_micro,
        'f1_score_macro': f1_score_macro,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'specificity': specificity,
        'NPV': NPV,
        'PPV': PPV
    }

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1_macro: {f1_score_macro:.4f}")
    print(f"F1_micro: {f1_score_micro:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"Sensitivity: {recall:.4f}")
    print(f"PPV: {PPV:.4f}")
    print(f"NPV: {NPV:.4f}")
    print("=============================")
    print("Classification Report: \n", report)
    print("=============================")
    print("Confusion Matrix: \n", matrix)

    return results

# Score the test-set predictions (`val_preds`) against the true test labels.
metrics_results = calculate_metrics(y_test, val_preds)

# VOTING

In [None]:
import pandas as pd
# Per-model prediction tables to be averaged; replace the placeholder file names.
# NOTE(review): presumably both files share the same column names and row
# order — confirm before voting.
MODEL1_PREDICTION = pd.read_csv("YOUR_MODEL_PREDICTION1.csv")
MODEL2_PREDICTION = pd.read_csv("YOUR_MODEL_PREDICTION2.csv")

In [None]:
# combine models: place the two tables side by side (duplicate column names are kept)
COMBINED_PREDICTION = pd.concat([MODEL1_PREDICTION, MODEL2_PREDICTION], axis=1)

# COUNT AVERAGE SCORE
# Average the columns that share a name. `groupby(..., axis=1)` is deprecated
# in recent pandas, so group the transposed frame by its index instead.
VOTING_RESULT = COMBINED_PREDICTION.T.groupby(level=0).mean().T

# FIND THE MAX LABEL
# Both values are computed from the averaged numeric frame *before* any column
# is added, so the string `pred_label` column never takes part in the row-wise max.
# NOTE(review): every averaged column competes here; if the CSVs also carry
# label columns (e.g. predicted_label / true_label) they should probably be
# dropped first — confirm against the real files.
VOTING_RESULT = VOTING_RESULT.assign(
    pred_label=VOTING_RESULT.idxmax(axis=1),
    max_value=VOTING_RESULT.max(axis=1),
)
VOTING_RESULT