In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DebertaV2Tokenizer
from transformers import DebertaV2Model
import tqdm
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
import torch.nn as nn

In [None]:
# Choose the compute device once; later cells move the model and batches to it.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the labelled comments. `read_csv` already returns a DataFrame, so no
# re-wrapping is needed; the first column of the file is dropped.
df = pd.read_csv("/storage/home/ndh5286/Projects/MOTN Transformer/final_model_6.2.24.csv", encoding = 'latin-1')
df = df.iloc[:, 1:]

# Seed both NumPy and PyTorch. Seeding NumPy alone leaves the classifier-head
# initialisation, dropout and DataLoader shuffling (all driven by torch's RNG)
# different on every run.
np.random.seed(1337)
torch.manual_seed(1337)

# Every column from the third one onward is collapsed into a single per-row
# list of floats, which becomes the multi-label target vector.
label_columns = df.columns[2:]
df['list'] = df[label_columns].astype(float).values.tolist()

# Keep only the identifier, the text and the target vector for modelling.
new_df = df[['CASEID', 'comment_text', 'list']].copy()
new_df.head(10)

In [None]:
# Hyperparameters used by the cells below.
MAX_LEN = 200                                # max token length handed to the tokenizer
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4    # training loader / evaluation loaders
EPOCHS = 15                                  # number of passes of the training loop
LEARNING_RATE = 1e-5                         # Adam step size

# Tokenizer of the pretrained checkpoint that the model cell also loads.
tokenizer = DebertaV2Tokenizer.from_pretrained(
    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
)

In [None]:
# Defining CustomDataset class
class CustomDataset(Dataset): # Inherits Dataset class from PyTorch
    """Map-style dataset that tokenizes one comment per item.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``comment_text``, ``list`` (per-row list of
        label values) and ``CASEID``.
    tokenizer : callable
        Called as ``tokenizer(text, **kwargs)``; must return a mapping with
        ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    max_len : int
        Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Bracket access avoids any clash between column names such as
        # "list" and DataFrame attributes.
        self.comment_text = dataframe['comment_text']
        self.targets = dataframe['list']
        self.CASEID = dataframe['CASEID']
        self.max_len = max_len

    def __len__(self):
        """Number of rows (comments) in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded comment, its labels and its case id.

        ``index`` is positional (0 .. len-1), so the frame does not need a
        freshly reset index.
        """
        # Collapse all runs of whitespace into single spaces.
        comment_text = " ".join(str(self.comment_text.iloc[index]).split())

        # `padding='max_length'` replaces the deprecated `pad_to_max_length`;
        # `truncation=True` guarantees a fixed length even for long comments,
        # which the default batch collation requires.
        inputs = self.tokenizer(
            comment_text,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            truncation = True,
            return_token_type_ids = True
        )

        return {
            'caseid': self.CASEID.iloc[index],
            'text': comment_text,
            'ids': torch.tensor(inputs['input_ids'], dtype = torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype = torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype = torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype = torch.float)
        }

In [None]:
# Build the train / validation / test partitions and wrap each in a CustomDataset.

# Step 1: sample 80% of the rows for training; the remaining rows form the test set.
train_size = 0.8
train_dataset = new_df.sample(frac = train_size, random_state = 200)
test_dataset = new_df.drop(index = train_dataset.index).reset_index(drop = True)
train_dataset = train_dataset.reset_index(drop = True)

# Step 2: carve a validation subset out of the training rows (again 80/20).
train_split = 0.8
train_indices, val_indices = train_test_split(
    train_dataset.index,
    test_size = 1 - train_split,
    random_state = 200,
)
train_split_dataset, val_dataset = (
    train_dataset.loc[_idx].reset_index(drop = True)
    for _idx in (train_indices, val_indices)
)

# Report the shape of every partition.
for _template, _frame in (
    ("FULL Dataset: {}", new_df),
    ("TRAIN Dataset: {}", train_split_dataset),
    ("TEST Dataset: {}", test_dataset),
    ("VALIDATION Dataset: {}", val_dataset),
):
    print(_template.format(_frame.shape))

# Step 3: tokenizing datasets, one per partition.
training_set, testing_set, validation_set = (
    CustomDataset(_frame, tokenizer, MAX_LEN)
    for _frame in (train_split_dataset, test_dataset, val_dataset)
)

In [None]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation batches keep dataset order. Shuffling cannot change the metrics,
# but it made the row order of the exported predictions differ between runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
validation_loader = DataLoader(validation_set, **test_params)

In [None]:
# Creating the customized model by adding dropout.

# class BERTClass(torch.nn.Module):
#     def __init__(self):
#         super(BERTClass, self).__init__()
#         self.l1 = transformers.RobertaModel.from_pretrained('roberta-base')
#         self.l2 = torch.nn.Dropout(0.2)
#         self.l3 = torch.nn.Linear(768, 13)  # Num of labels is 13
    
#     def forward(self, ids, mask, token_type_ids):
#         _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
#         output_2 = self.l2(output_1)
#         output = self.l3(output_2)
#         return output

class DEBERTAClass(nn.Module):
    """DeBERTa-v2/v3 encoder followed by dropout, LayerNorm and a linear head.

    Parameters
    ----------
    num_classes : int
        Size of the output layer (one logit per label).
    model_name : str, optional
        Checkpoint passed to ``DebertaV2Model.from_pretrained``. Defaults to
        the checkpoint this notebook was written for.
    dropout : float, optional
        Dropout probability applied to the pooled representation.

    The attribute names ``l1``, ``dropout``, ``layer_norm`` and ``l2`` are kept
    as they are because they determine the parameter names in ``state_dict``.
    """

    def __init__(self, num_classes,
                 model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
                 dropout = 0.3):
        super().__init__()
        self.l1 = DebertaV2Model.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(self.l1.config.hidden_size)
        self.l2 = nn.Linear(self.l1.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one row per input sequence.

        ``ids``, ``mask`` and ``token_type_ids`` are forwarded to the encoder
        as input ids, attention mask and token type ids respectively.
        """
        outputs = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        last_hidden_state = outputs[0]  # first element of the encoder output

        # Pooling: take the representation at sequence position 0
        # (the [CLS] token when special tokens are added by the tokenizer).
        pooled_output = last_hidden_state[:, 0, :]

        # Dropout is applied before normalisation, then the linear head.
        dropped = self.dropout(pooled_output)
        normed = self.layer_norm(dropped)
        return self.l2(normed)
    
# Build the classifier with 13 output logits (adjust when the label set changes)
# and move its parameters to the selected device.
model = DEBERTAClass(num_classes = 13)
model.to(device)

In [None]:
# Loss function
def loss_fn(outputs, targets):
    """Mean binary cross-entropy computed directly on raw logits.

    ``outputs`` are un-activated model logits and ``targets`` the float label
    tensor of the same shape; the sigmoid is applied inside the loss.
    """
    return F.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
# Adam over every parameter of the model (encoder and classification head).
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [None]:
# Defined training function
def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.

    Parameters
    ----------
    epoch : int
        Epoch number, used only for the progress bar and the log line.

    Returns
    -------
    float or None
        Mean loss over all batches of the epoch, or ``None`` when the loader
        yielded no batches.
    """
    model.train()
    running_loss = 0.0
    n_batches = 0

    for batch in tqdm.tqdm(training_loader, desc = f"Epoch {epoch}"):
        # Unpack the batch
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.float)

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    # An empty loader previously crashed on the unbound `loss` variable.
    if n_batches == 0:
        print(f'Epoch: {epoch}, no batches were processed')
        return None

    # Report the epoch mean rather than the loss of whichever mini-batch
    # happened to come last, which is a noisy progress signal.
    mean_loss = running_loss / n_batches
    print(f'Epoch: {epoch}, Loss: {mean_loss}')
    return mean_loss

In [None]:
# Fine-tune for EPOCHS full passes over the training loader.
for epoch in range(0, EPOCHS):
    train(epoch = epoch)

In [None]:
# Validation Function
def validation(epoch, loader = None):
    """Run the model in inference mode over a loader and collect predictions.

    Uses the module-level ``model`` and ``device``.

    Parameters
    ----------
    epoch : int
        Used only in the progress-bar description.
    loader : iterable of batches, optional
        Batches as produced by the notebook's DataLoaders. Defaults to
        ``testing_loader``, so ``validation(epoch)`` behaves as before; passing
        a loader removes the need for a copy of this function per split.

    Returns
    -------
    tuple
        ``(fin_outputs, fin_targets, texts, caseid)``: sigmoid probabilities
        and targets as nested Python lists, the comment texts and the case ids.
    """
    if loader is None:
        loader = testing_loader

    model.eval()
    fin_targets, fin_outputs, texts, caseid = [], [], [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(loader, desc = f"Validation Epoch {epoch}"):
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)

            outputs = model(ids, mask, token_type_ids)

            # Targets are only recorded, so they never need to visit the device.
            fin_targets.extend(batch['targets'].float().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
            texts.extend(batch['text'])  # Extracting the text column

            # Numeric case ids are collated into a tensor; store plain Python
            # values instead of 0-d tensors so they export cleanly.
            batch_caseid = batch['caseid']
            if torch.is_tensor(batch_caseid):
                batch_caseid = batch_caseid.tolist()
            caseid.extend(batch_caseid)

    return fin_outputs, fin_targets, texts, caseid

# For Test Data
epoch = 0  # a single evaluation pass
outputs, targets, texts, caseid = validation(epoch)

# Binarise the probabilities at the decision threshold.
outputs = np.asarray(outputs) >= 0.65                                      #This can be tuned

accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average = _avg)
    for _avg in ('micro', 'macro')
)

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
# Validation Function
def validation(epoch, loader = None):
    """Run the model in inference mode over a loader and collect predictions.

    Uses the module-level ``model`` and ``device``. This definition replaces
    any earlier ``validation`` in the notebook namespace.

    Parameters
    ----------
    epoch : int
        Used only in the progress-bar description.
    loader : iterable of batches, optional
        Batches as produced by the notebook's DataLoaders. Defaults to
        ``validation_loader``, so ``validation(epoch)`` behaves as before; pass
        another loader (e.g. ``testing_loader``) to score a different split
        without redefining the function.

    Returns
    -------
    tuple
        ``(fin_outputs, fin_targets, texts, caseid)``: sigmoid probabilities
        and targets as nested Python lists, the comment texts and the case ids.
    """
    if loader is None:
        loader = validation_loader

    model.eval()
    fin_targets, fin_outputs, texts, caseid = [], [], [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(loader, desc = f"Validation Epoch {epoch}"):
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)

            outputs = model(ids, mask, token_type_ids)

            # Targets are only recorded, so they never need to visit the device.
            fin_targets.extend(batch['targets'].float().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
            texts.extend(batch['text'])  # Extracting the text column

            # Numeric case ids are collated into a tensor; store plain Python
            # values instead of 0-d tensors so they export cleanly.
            batch_caseid = batch['caseid']
            if torch.is_tensor(batch_caseid):
                batch_caseid = batch_caseid.tolist()
            caseid.extend(batch_caseid)

    return fin_outputs, fin_targets, texts, caseid

# For Validation (Hold out) Data
epoch = 0  # a single evaluation pass
outputs, targets, texts, caseid = validation(epoch)

# Binarise the probabilities at the decision threshold.
outputs = np.asarray(outputs) >= 0.65

accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average = _avg)
    for _avg in ('micro', 'macro')
)

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

# Bundle predictions, labels, texts and ids; the next cell turns this into a table.
final_df = dict(
    Outputs = outputs,
    Targets = targets,
    Texts = texts,
    Caseid = caseid,
)

In [None]:
# Creating final output
# The table is built from `caseid`, `texts` and `outputs` (the values the
# evaluation cell stored in the `final_df` dict) rather than from `final_df`
# itself. This cell rebinds `final_df` to a DataFrame, so reading the dict
# back made the cell fail with a KeyError when it was run a second time.

# NOTE(review): "E Plurabus Unum" looks like a misspelling of "E Pluribus Unum";
# left unchanged because it is a column name downstream code may rely on.
output_columns = ["Freedom and Rights", "E Plurabus Unum", "Representation of the People", "Popular Will and Equality", "National Identity and Heritage", "Not a Democracy a Republic",
                "Flawed Democracy", "Institution and Constitution", "Unclassified/Other", "Don't Know", "Nothing/Disaffected", "Nothing More to Add", "NA"]

# Case ids may arrive as 0-d tensors (numeric ids are collated into tensors);
# unwrap them so the column holds plain values.
final_case = pd.DataFrame(
    {"Caseid": [c.item() if torch.is_tensor(c) else c for c in caseid]}
)

final_text = pd.DataFrame({"Text": list(texts)})

# Boolean predictions -> 0/1 integers; `astype` avoids the in-place
# `replace` on a boolean frame.
final_output = pd.DataFrame(outputs, columns = output_columns).astype(int)

# All three frames share the default positional index, so joining aligns rows.
final_df = final_case.join(final_text).join(final_output)

In [None]:
# Preview the first ten rows of the prediction table.
final_df.head(n = 10)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Ground truth and thresholded predictions from the most recent evaluation cell.
true_np, pred_np = targets, outputs

# One confusion matrix per label column.
conf_matrix = multilabel_confusion_matrix(true_np, pred_np)

# Human-readable label names, in the same order as the target columns.
label_titles = ["Freedom and Rights", "E Plurabus Unum", "Representation of the People", "Popular Will and Equality", "National Identity and Heritage", "Not a Democracy a Republic", 
                "Flawed Democracy", "Institution and Constitution", "Unclassified/Other", "Don't Know", "Nothing/Disaffected", "Nothing More to Add", "NA"]

# Map each label name to its matrix.
conf_matrix_dict = {
    title: conf_matrix[position]
    for position, title in enumerate(label_titles)
}

# Print every matrix under its label name.
for title, matrix in conf_matrix_dict.items():
    print(f"Confusion matrix for {title}:")
    print(matrix)

In [None]:
# torch.save(model, "/storage/home/ndh5286/Projects/MOTN Transformer/NLI_DeBERTaV2_transformer_7.18.24.pth")

In [None]:
# model = torch.load("/storage/home/ndh5286/Projects/MOTN Transformer/final_model_6.2.24.pth")