# Load Essential Libraries

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BartTokenizer, AutoModelForSequenceClassification, AutoModel, TrainerCallback, TrainingArguments, EarlyStoppingCallback, Trainer, AdamW
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.multiclass import unique_labels

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchaudio
# Report the installed PyTorch stack versions, one per line.
print(
    torch.__version__,
    torchvision.__version__,
    torchaudio.__version__,
    sep="\n",
)

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# BART

## Read data

In [None]:
# Load the relation dataset from a hard-coded absolute path; later cells read
# its text1, text2 and label columns.
df = pd.read_csv("/mnt/nas/NTCIR17/FinARG/Integrate_dataset/ECC_relation.csv")
# Bare expression: shows the frame as the notebook cell output.
df

In [None]:
# Separate the paired input texts from the target column; the double-bracket
# selection keeps both results as DataFrames.
texts, label = df[['text1', 'text2']], df[['label']]

In [None]:
# Stratified 80/20 split on the label column, with a fixed seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)

In [None]:
# Re-attach the labels to the training texts, side by side.
train_df = pd.concat([X_train, y_train], axis="columns")
train_df

In [None]:
# Re-attach the labels to the held-out texts, side by side.
test_df = pd.concat([X_test, y_test], axis="columns")
test_df

## turn into DATASET

In [None]:
# Convert both splits to Hugging Face datasets.  After train_test_split the
# frames carry a shuffled, non-default index; preserve_index=False keeps that
# index from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Fixed seed so the training-set shuffle is reproducible.
SEED = 602
train_dataset = train_dataset.shuffle(seed=SEED)

In [None]:
# Bare expression: shows the training dataset summary as the cell output.
train_dataset

In [None]:
# Bare expression: shows the test dataset summary as the cell output.
test_dataset

## TOKENIZE

In [None]:
# Tokenizer of the base BART-large checkpoint.
# NOTE(review): the model below is loaded from 'facebook/bart-large-mnli';
# presumably both checkpoints share one vocabulary — confirm.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

In [None]:
# Every encoded pair is padded or truncated to exactly this many tokens.
MAX_LEN = 512


def tokenize_function(examples):
    """Encode one example's ``text1``/``text2`` as a fixed-length pair.

    Args:
        examples: Mapping with the keys ``"text1"`` and ``"text2"`` and,
            optionally, ``"label"``.

    Returns:
        The tokenizer output for the pair, padded/truncated to ``MAX_LEN``.
        When the example has a ``"label"`` it is copied over under the key
        ``"labels"``.
    """
    encoded = tokenizer.encode_plus(
        examples["text1"],
        examples["text2"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

    # Unlabelled examples are returned as encoded.
    if "label" not in examples:
        return encoded

    encoded["labels"] = examples["label"]
    return encoded

In [None]:
# Tokenize the training split first, then the test split, dropping the raw
# text and label columns (the label is re-emitted as "labels").
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, remove_columns=["text1", "text2", "label"])
    for split in (train_dataset, test_dataset)
)

In [None]:
# Bare expression: shows the tokenized training dataset as the cell output.
tokenized_train_dataset

In [None]:
# Bare expression: shows the tokenized test dataset as the cell output.
tokenized_test_dataset

In [None]:
# #Resampling
# from imblearn.over_sampling import RandomOverSampler
# from sklearn.datasets import make_classification
# from sklearn.model_selection import train_test_split
# from collections import Counter

# tokenized_train_dataset_df = pd.DataFrame(tokenized_train_dataset)
# tokenized_train_dataset_df = tokenized_train_dataset_df[['input_ids', 'attention_mask', 'labels']]
# X = np.array(tokenized_train_dataset_df['input_ids'].to_list()), np.array(tokenized_train_dataset_df['attention_mask'].to_list())
# X = np.concatenate(X, axis=1)
# y = tokenized_train_dataset_df['labels']
# # over
# ros = RandomOverSampler(random_state=42)
# X_resampled, y_resampled = ros.fit_resample(X, y)
# print("Resampled dataset shape:", Counter(y_resampled))

# input_ids = np.stack(X_resampled[:,:512])
# attention_mask = np.stack(X_resampled[:,512:])
# X_train = pd.DataFrame({
#     'input_ids': input_ids.tolist(),
#     'attention_mask': attention_mask.tolist()
# })
# y_train = np.array(y_resampled)
# y_train = pd.DataFrame(y_train, columns=['label'])
# tokenized_train_dataset = Dataset.from_dict({
#     "input_ids": X_train["input_ids"].to_numpy(),
#     "attention_mask": X_train["attention_mask"].to_numpy(),
#     "labels": y_train['label'].to_numpy()
# })
# print(f'tokenized_train_dataset:{tokenized_train_dataset}')
# print(f'tokenized_test_dataset:{tokenized_test_dataset}')

## DEFINE MODEL

In [None]:
# Start from the MNLI checkpoint of BART-large.
model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')

# Replace the pretrained head with a freshly initialised MLP:
# 1024 -> 512 -> 3, with ReLU and dropout in between.
new_classification_head = nn.Sequential(
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(512, 3),
)
model.classification_head = new_classification_head

# Move the model to the selected device; as the last expression this also
# shows the module as the cell output.
model.to(device)

## Training

In [None]:
def matrix_plt():
    """Plot, save and show the confusion matrix of the test predictions.

    Reads the module-level ``y_true`` and ``y_predict`` arrays, writes the
    figure to ``matrix.png`` and then displays it.
    """
    # Class ids as produced by the 3-way argmax, and their display names.
    # NOTE(review): assumes the gold labels use the same 0/1/2 encoding.
    class_ids = [0, 1, 2]
    labels = ["0", "1", "2"]
    # Pin the label set so the matrix is always 3x3: a class missing from
    # both arrays would otherwise yield a smaller matrix that no longer
    # matches the three display labels.
    cm = confusion_matrix(y_true, y_predict, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.savefig("matrix.png")
    plt.show()

def _mean_logged_value(rows, column):
    """Return the mean of the non-null ``column`` values in ``rows``, or NaN."""
    # The column is absent when no entry of that kind was ever logged.
    if column not in rows.columns:
        return np.nan
    values = rows[column].dropna()
    return values.mean() if len(values) else np.nan


def observing_loss():
    """Print and plot the per-epoch training and evaluation loss.

    Reads ``trainer.state.log_history`` from the module-level ``trainer``,
    pairs the logged ``loss`` and ``eval_loss`` values by their ``epoch``
    value, drops epochs without an evaluation loss, prints the resulting
    table, saves the plot to ``loss.png`` and displays it.
    """
    train_history = pd.DataFrame(trainer.state.log_history)
    epoch = train_history['epoch'].unique().tolist()

    # One value per epoch and per loss kind: NaN when nothing was logged for
    # that epoch, the mean when several entries share the same epoch.  This
    # keeps all three columns the same length.
    rows_by_epoch = [train_history[train_history['epoch'] == e] for e in epoch]
    history = pd.DataFrame({
        'epoch': epoch,
        'train_loss': [_mean_logged_value(rows, 'loss') for rows in rows_by_epoch],
        'eval_loss': [_mean_logged_value(rows, 'eval_loss') for rows in rows_by_epoch],
    })

    # Only epochs that were actually evaluated are reported and plotted.
    history = history.dropna(subset=['eval_loss'])
    print(history)

    plt.plot(history['epoch'], history['train_loss'], label='Train Loss')
    plt.plot(history['epoch'], history['eval_loss'], label='Eval Loss')
    plt.legend()
    plt.title('Training and Evaluation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.savefig("loss.png")
    plt.show()

def compute_metrics(eval_preds):
    """Compute accuracy and macro-F1 for one evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, labels)``.  ``predictions`` is either
            the logits array itself or a tuple/list whose first element holds
            the logits (the form used for BART here).

    Returns:
        Dict with the keys ``'val_accuracy'`` and ``'macro_f1_score'``.
    """
    logits, labels = eval_preds
    # Unwrap only when the predictions really are a tuple; indexing a plain
    # logits array with [0] would select the first example instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predictions = logits.argmax(axis=-1)

    val_accuracy = accuracy_score(labels, predictions)
    macro_f1_score = f1_score(labels, predictions, average='macro')

    return {'val_accuracy': val_accuracy, 'macro_f1_score': macro_f1_score}

# Evaluate, checkpoint and log once per epoch.  Logging the training loss per
# epoch (instead of the default step-based logging) lines it up with the
# per-epoch eval entries that observing_loss() pairs by epoch value.
# The former save_steps=-1 / eval_steps=-1 arguments were dropped: step
# counts are not used when both strategies are 'epoch'.
training_args = TrainingArguments(
    output_dir="./modelpoint",
    num_train_epochs=30,
    # NOTE(review): 3e-8 is far below common fine-tuning learning rates;
    # confirm this value is intentional.
    learning_rate=3e-8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,
    seed=616,
)

# Early stopping with a patience of 3 evaluations.
patience = 3
early_stopping = EarlyStoppingCallback(early_stopping_patience=patience)

Class weights (optional; the cell below is commented out by default)

In [None]:
# from torch import nn
# from transformers import Trainer

# class_counts = np.bincount(tokenized_train_dataset['labels'])
# class_weights = 1.0 / class_counts
# weights_tensor = torch.tensor(class_weights, device=device)
# loss_fn = nn.CrossEntropyLoss(weight=weights_tensor)

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.get("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.logits
#         # compute custom loss (suppose one has 3 labels with different weights)
#         loss_fn = nn.CrossEntropyLoss(weight=weights_tensor.float()) 
#         loss = loss_fn(logits.view(-1, self.model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss

In [None]:
# class_weights

In [None]:
# Use the class-weighted CustomTrainer only when its optional definition has
# been executed in this session; otherwise fall back to the stock Trainer so
# this cell does not fail with a NameError.
trainer_class = globals().get("CustomTrainer", Trainer)

trainer = trainer_class(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

## Predict

In [None]:
# Run inference on the held-out split; the following cells read the result's
# .predictions and .label_ids fields.
predictions = trainer.predict(tokenized_test_dataset)

In [None]:
# Gold labels, and the logits array held in the first slot of the BART
# prediction tuple.
y_true = predictions.label_ids
prods = predictions.predictions[0]
# Predicted class id = position of the largest logit in each row.
y_predict = prods.argmax(axis=-1)

# Headline F1 scores followed by the per-class breakdown.
print('Macro-average: {0}'.format(
    metrics.f1_score(y_true, y_predict, average='macro')
))
print('Micro-average: {0}'.format(
    metrics.f1_score(y_true, y_predict, average='micro')
))
print(classification_report(y_true, y_predict, digits=6))

# Raw confusion-matrix counts.
print(metrics.confusion_matrix(y_true, y_predict))

In [None]:
# Plot the confusion matrix for y_true / y_predict and save it to matrix.png.
matrix_plt()

In [None]:
# Pair each gold label with its predicted label for export.
answer = pd.DataFrame({'y_true': y_true, 'y_pred': y_predict})
# Bare expression: shows the frame as the cell output.
answer

In [None]:
# Write the gold/predicted label pairs to the working directory, without the
# row index.
answer.to_csv("BartWithResam.csv", index=False)

In [None]:
# Print and plot the training/evaluation loss history; saves loss.png.
observing_loss()

Save the predicted class probabilities

In [None]:
# Logits array: first element of the prediction tuple returned for BART.
logits = predictions.predictions[0]

In [None]:
# Numerically stable softmax over the class axis: subtracting each row's
# maximum leaves the result unchanged but keeps np.exp from overflowing on
# large logits.
probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs = probs / probs.sum(axis=1, keepdims=True)
probs

In [None]:
# One column per class id (0, 1, 2); rows follow the order of the test
# predictions.
df_probs = pd.DataFrame(probs, columns=[0, 1, 2])
# Bare expression: shows the frame as the cell output.
df_probs

In [None]:
# Write the probabilities without the row index.
# NOTE: the ./Bart8_2 directory must already exist; to_csv does not create it.
df_probs.to_csv('./Bart8_2/BartWithResam_probs.csv', index=False)