# MOTN Transformer July 23rd 2024

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DebertaV2Tokenizer
from transformers import DebertaV2Model
import tqdm
from sklearn.model_selection import train_test_split

### 1. Check for GPU

In [None]:
# Setting up the device for GPU usage
from torch import cuda
# Device string used by the `.to(device)` calls later in the notebook:
# 'cuda' when torch reports an available CUDA device, otherwise 'cpu'.
device = 'cuda' if cuda.is_available() else 'cpu'

### 2. Import Data and Preprocess

In [None]:
# Load the coded survey responses. `read_csv` already returns a DataFrame, so
# no re-wrapping is needed. The absolute path and latin-1 encoding are specific
# to the original author's environment.
df = pd.read_csv("/storage/home/ndh5286/Projects/MOTN Transformer/2021_2024.csv", encoding='latin-1')
# Drop the first column (presumably a saved index column -- TODO confirm).
df = df.iloc[:, 1:]

# Seed both NumPy and PyTorch. Weight initialisation, dropout and DataLoader
# shuffling draw from torch's generator, which the NumPy seed does not touch.
np.random.seed(1337)
torch.manual_seed(1337)

# Collapse every column from position 2 onward into one per-row list.
# NOTE(review): assumes columns 0 and 1 are CASEID and comment_text and the
# rest are the label indicators -- confirm against the CSV.
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['CASEID', 'comment_text', 'list']].copy()

# Store the labels as Python floats so they convert cleanly to float tensors.
new_df['list'] = new_df['list'].apply(lambda x: [float(i) for i in x])
new_df.head(10)

### 3. Defining Key Variables and Tokenizer

In [None]:
# Defining some key variables
MAX_LEN = 200  # max_length handed to the tokenizer for every comment
TRAIN_BATCH_SIZE = 8  # batch size placed in the DataLoader params
VALID_BATCH_SIZE = 4  # batch size intended for evaluation batches
EPOCHS = 15  # number of iterations of the training loop
LEARNING_RATE = 1e-05  # learning rate passed to the Adam optimizer

# Defining Tokenizer
# Loaded from the same checkpoint name the model class uses for its encoder.
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

### 4. Creating Dataset Object for Dataloader

In [None]:
# Defining CustomDataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        dataframe: DataFrame with the columns ``CASEID``, ``comment_text``
            and ``list`` (the per-row list of label values).
        tokenizer: Object exposing ``encode_plus`` in the Hugging Face
            tokenizer style.
        max_len: Fixed sequence length; every item is padded or truncated
            to exactly this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe['comment_text']
        self.targets = dataframe['list']
        self.CASEID = dataframe['CASEID']
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the tokenized comment at position ``index``.

        Returns:
            dict with the keys ``caseid``, ``text`` (whitespace-normalised
            comment), ``ids``, ``mask``, ``token_type_ids`` (long tensors of
            length ``max_len``) and ``targets`` (float tensor of labels).
        """
        # Positional access: DataLoader indices run 0..len-1, which only
        # matches label-based access when the frame's index was reset.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse runs of whitespace (including newlines) to single spaces.
        comment_text = " ".join(comment_text.split())

        # `padding='max_length'` replaces the deprecated `pad_to_max_length`.
        # `truncation=True` guarantees that long comments are cut to
        # `max_len`; without it items could differ in length and fail to
        # stack into a batch.
        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'caseid': self.CASEID.iloc[index],
            'text': comment_text,
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

### 5. Train/Val/Test Split

In [None]:
# Build the three data splits and wrap each in a CustomDataset.

# Fractions: 80% of the rows go to train+validation, and 75% of that portion
# (60% of the total) becomes the training split.
train_val_size = 0.8
train_size = 0.75

# First carve off the hold-out test rows, then divide the remainder.
# The same fixed random_state is used for both calls.
train_val_dataset, test_dataset = train_test_split(
    new_df, test_size=1-train_val_size, random_state=200
)
train_dataset, val_dataset = train_test_split(
    train_val_dataset, test_size=1-train_size, random_state=200
)

# Give every split a fresh 0..n-1 index.
train_dataset, val_dataset, test_dataset = (
    frame.reset_index(drop=True)
    for frame in (train_dataset, val_dataset, test_dataset)
)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALIDATION Dataset: {}".format(val_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap each split with the shared tokenizer and sequence length.
training_set, validation_set, testing_set = (
    CustomDataset(frame, tokenizer, MAX_LEN)
    for frame in (train_dataset, val_dataset, test_dataset)
)

### 6. Setting Params and Creating Dataloader Object

In [None]:
# Training batches are reshuffled on every pass over the data.
params = {'batch_size': TRAIN_BATCH_SIZE,
          'shuffle': True,
          'num_workers': 0
          }

# Evaluation batches keep the dataset order, so predictions line up with the
# validation/test rows, and use the dedicated evaluation batch size.
eval_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

train_loader = DataLoader(training_set, **params)
val_loader = DataLoader(validation_set, **eval_params)
test_loader = DataLoader(testing_set, **eval_params)

### 7. Defining Model with Extra Dropout and a Linear Classification Head

In [None]:
# Creating the customized model by adding dropout

from torch.nn import functional as F
import torch.nn as nn

class DEBERTAClass(nn.Module):
    """DeBERTa-v3 encoder followed by dropout and a single linear layer.

    Args:
        num_classes: Number of output units (one logit per label).
    """

    def __init__(self, num_classes):
        super().__init__()
        encoder = DebertaV2Model.from_pretrained('microsoft/deberta-v3-base')
        self.l1 = encoder
        self.dropout = nn.Dropout(0.2)
        # The head maps the encoder's hidden size to the label logits.
        self.l2 = nn.Linear(encoder.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits for a batch of token ids."""
        encoded = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Element 0 of the encoder output is the last hidden state; keep only
        # the vector at sequence position 0 (the [CLS] token) as the summary.
        first_token = encoded[0][:, 0, :]
        return self.l2(self.dropout(first_token))
    
# Ten output units: one logit per label title listed where the prediction
# DataFrame and confusion matrices are built.
model = DEBERTAClass(10)
# Move the parameters to the selected device. As the last expression in the
# cell, this also displays the module summary in a notebook.
model.to(device)

### 8. Define Loss and Optimizer

In [None]:
# Loss function
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between raw logits and targets.

    The sigmoid is applied inside the loss, so `outputs` must be the model's
    unnormalised logits and `targets` float values of the same shape.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Optimizer
# Adam over every model parameter (encoder and classification head alike),
# using the learning rate defined with the other key variables.
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

### 9. Training and Validation Loop

In [None]:
def _batch_to_device(batch):
    """Return (ids, mask, token_type_ids, targets) from a batch, moved to `device`."""
    return (
        batch['ids'].to(device, dtype=torch.long),
        batch['mask'].to(device, dtype=torch.long),
        batch['token_type_ids'].to(device, dtype=torch.long),
        batch['targets'].to(device, dtype=torch.float),
    )


# One iteration per epoch: a full training pass, then a gradient-free pass
# over the validation loader, then a one-line summary of the running means.
for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0

    for batch in tqdm.tqdm(train_loader, desc = f"Epoch {epoch + 1}"):
        ids, mask, token_type_ids, targets = _batch_to_device(batch)

        optimizer.zero_grad()
        output = model(ids, mask, token_type_ids)
        loss = loss_fn(output, targets)
        loss.backward()
        optimizer.step()

        # NOTE(review): this accuracy compares only the highest-scoring logit
        # with the position of the largest target value; for rows carrying
        # several labels it is not a per-label accuracy -- confirm intent.
        hits = output.argmax(dim=1) == targets.argmax(dim=1)
        acc = hits.float().mean()
        # Each batch contributes its mean divided by the number of batches.
        epoch_accuracy += acc / len(train_loader)
        epoch_loss += loss.item() / len(train_loader)

    # ---- validation pass ----
    model.eval()
    epoch_val_accuracy = 0
    epoch_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            ids, mask, token_type_ids, targets = _batch_to_device(batch)

            val_output = model(ids, mask, token_type_ids)
            val_loss = loss_fn(val_output, targets)

            hits = val_output.argmax(dim=1) == targets.argmax(dim=1)
            acc = hits.float().mean()
            epoch_val_accuracy += acc / len(val_loader)
            epoch_val_loss += val_loss.item() / len(val_loader)

    print(
        f"Epoch : {epoch + 1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

### 10. Accuracy Check on Test Dataset (Holdout)

In [None]:
# Validation Function
def validation(epoch=None):
    """Run the model over `test_loader` and collect its predictions.

    Args:
        epoch: Unused; kept (now optional) so existing calls that pass an
            epoch number keep working.

    Returns:
        A 4-tuple ``(fin_outputs, fin_targets, texts, caseid)`` of equal-length
        lists: per-row sigmoid probabilities, per-row target values, the
        comment texts, and the case IDs as plain Python values.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    texts = []
    caseid = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_loader, desc="Validation Epoch"):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)

            # No gradients are tracked here, so the tensors can be converted
            # to nested Python lists straight from the CPU copy.
            fin_targets.extend(targets.cpu().tolist())
            # The model emits logits; sigmoid turns them into probabilities.
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
            texts.extend(batch['text'])  # Extracting the text column

            # Numeric IDs are collated into a tensor; extending with it
            # directly would store 0-dim tensors rather than plain values.
            batch_caseid = batch['caseid']
            if torch.is_tensor(batch_caseid):
                batch_caseid = batch_caseid.tolist()
            caseid.extend(batch_caseid)

    return fin_outputs, fin_targets, texts, caseid

# For Validation (Hold out) Data
# Single-iteration loop: runs the evaluation once. The names assigned inside
# (`outputs`, `targets`, `texts`, `caseid`, `final_df`) remain available to
# the following cells.
for epoch in range(1):
    outputs, targets, texts, caseid = validation(epoch)
    # Turn the sigmoid probabilities into boolean predictions per label.
    outputs = np.array(outputs) >= 0.65                                      # Decision threshold; this can be tuned
    # Per scikit-learn's documentation, accuracy on multi-label input is
    # subset accuracy: a row counts only if every label matches.
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average = 'micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average = 'macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    #print("First three items of texts:", texts[:3])
    #print("First three items of fin_outputs:", outputs[:3])
    #print("First three items of fin_targets:", targets[:3])

    # Create a dictionary with column names as keys and lists as values
    # (a plain dict at this point; the next cell turns it into a DataFrame).
    final_df = {
        'Outputs': outputs,
        'Targets': targets,
        'Texts': texts,
        'Caseid': caseid
    }

### 11. Creating Output Data for Inference

In [None]:
# Creating final output: one row per hold-out comment with its case ID, text
# and a 0/1 column per label.

# Case IDs may be 0-dim tensors if the DataLoader collated numeric IDs;
# unwrap them so the column holds plain values.
final_case = [c.item() if torch.is_tensor(c) else c for c in final_df['Caseid']]
final_case = pd.DataFrame(final_case, columns = ["Caseid"])

final_text = final_df['Texts']
final_text = pd.DataFrame(final_text, columns = ["Text"])

final_output = final_df['Outputs']
final_output = pd.DataFrame(final_output, columns = ["Freedom and Rights", "Not a Democracy a Republic", "Flawed Democracy", "Institution and Constitution", "Don't Know", "Nothing/Disaffected", 
                                                     "Nothing More to Add", "NA", "Unclassified", "Representation and Popular Will"])
# Cast the boolean predictions to 0/1 directly. `replace({True: 1, False: 0})`
# depends on pandas silently downcasting the result, which is deprecated.
final_output = final_output.astype(int)

# All three frames share the default 0..n-1 index, so the index joins simply
# place the columns side by side.
final_df = final_text.join(final_output)
final_df = final_case.join(final_df)

In [None]:
# Preview the first ten rows of the assembled prediction table.
final_df.head(10)

### 12. Check Confusion Matrices

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Ground-truth rows and thresholded predictions from the hold-out evaluation.
true_np = targets
pred_np = outputs

# One confusion matrix per label column.
conf_matrix = multilabel_confusion_matrix(true_np, pred_np)

# Human-readable label names, in the same order as the label columns.
label_titles = ["Freedom and Rights", "Not a Democracy a Republic", "Flawed Democracy", "Institution and Constitution", "Don't Know", "Nothing/Disaffected", 
                "Nothing More to Add", "NA", "Unclassified", "Representation and Popular Will"]

# Pair every title with the matrix at the same position.
conf_matrix_dict = {
    title: conf_matrix[position] for position, title in enumerate(label_titles)
}

# Report each matrix under its label name.
for title, matrix in conf_matrix_dict.items():
    print(f"Confusion matrix for {title}:")
    print(matrix)

In [None]:
# torch.save(model, "/storage/home/ndh5286/Projects/MOTN Transformer/DeBERTaV3_model_7.23.24.pth")

In [None]:
# model = torch.load("/storage/home/ndh5286/Projects/MOTN Transformer/final_model_6.2.24.pth")