# MOTN Transformer July 23rd 2024

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DebertaV2Tokenizer
from transformers import DebertaV2Model
import tqdm
from sklearn.model_selection import train_test_split

### 1. Check for GPU

In [3]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### 2. Import Data and Preprocess

In [4]:
df = pd.read_csv("/storage/home/ndh5286/Projects/MOTN Transformer/final_model_6.2.24.csv", encoding='latin-1')
df = pd.DataFrame(df)
df = df.iloc[: , 1:]

np.random.seed(1337)

# Creates the dataframe
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['CASEID', 'comment_text', 'list']].copy()

# Applies float to list
new_df['list'] = new_df['list'].apply(lambda x: [float(i) for i in x])
new_df.head(10)

Unnamed: 0,CASEID,comment_text,list
0,2061638667,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2056600635,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2058253621,Free nation where citizens elect their represe...,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2058997303,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2058184341,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,2057930711,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,2058524165,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,2057837907,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,2058736151,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,2057900787,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### 3. Defining Key Variables and Tokenizer

In [5]:
# Defining some key variables
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8 #8
VALID_BATCH_SIZE = 4 #4
EPOCHS = 15
LEARNING_RATE = 1e-05

# Defining Tokenizer

tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

### 4. Creating Dataset Object for Dataloader

In [6]:
# Defining CustomDataset class
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.CASEID = self.data.CASEID
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'caseid': self.CASEID[index],
            'text': comment_text,
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

### 5. Train/Val/Test Split

In [42]:
# Creating the dataset and dataloader for the neural network

# Split the dataset into train and test
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
val_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Split the train_dataset further into train and validation
train_split = 0.8
train_indices, test_indices = train_test_split(train_dataset.index, test_size=1-train_split, random_state=200)
train_split_dataset = train_dataset.loc[train_indices].reset_index(drop=True)
test_dataset = train_dataset.loc[test_indices].reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_split_dataset.shape))
print("VALIDATION Dataset: {}".format(val_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Create the datasets
training_set = CustomDataset(train_split_dataset, tokenizer, MAX_LEN)
validation_set = CustomDataset(val_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (13987, 3)
TRAIN Dataset: (8952, 3)
VALIDATION Dataset: (2797, 3)
TEST Dataset: (2238, 3)


### 6. Setting Params and Creating Dataloader Object

In [43]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

train_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(validation_set, **val_params)
test_loader = DataLoader(testing_set, **test_params)

### 7. Defining Model with Extra Dropout and Normalization

In [44]:
# Creating the customized model by adding dropout

from torch.nn import functional as F
import torch.nn as nn

class DEBERTAClass(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.l1 = DebertaV2Model.from_pretrained('microsoft/deberta-v3-base')
        self.dropout = nn.Dropout(0.2)
        self.l2 = nn.Linear(self.l1.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs[0]  # Get the last hidden state
        
        # Pooling: Use the [CLS] token representation (first token)
        pooled_output = last_hidden_state[:, 0, :]
        
        output_2 = self.dropout(pooled_output)
        output = self.l2(output_2)
        return output    
    
model = DEBERTAClass(13)
model.to(device)

DEBERTAClass(
  (l1): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropo

### 8. Define Loss and Optimizer

In [45]:
# Loss function
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

# Optimizer
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

### 9. Define Training and Validation Functions

In [47]:
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0
    
    for batch in tqdm.tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)
        
        optimizer.zero_grad()
        output = model(ids, mask, token_type_ids)
        loss = loss_fn(output, targets)
        loss.backward()
        optimizer.step()
        
        acc = (output.argmax(dim=1) == targets.argmax(dim=1)).float().mean()
        epoch_accuracy += acc / len(train_loader)
        epoch_loss += loss.item() / len(train_loader)
    
    model.eval()
    epoch_val_accuracy = 0
    epoch_val_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            
            val_output = model(ids, mask, token_type_ids)
            val_loss = loss_fn(val_output, targets)
            
            acc = (val_output.argmax(dim=1) == targets.argmax(dim=1)).float().mean()
            epoch_val_accuracy += acc / len(val_loader)
            epoch_val_loss += val_loss.item() / len(val_loader)
    
    print(
        f"Epoch : {epoch + 1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

Epoch 1: 100%|██████████| 1119/1119 [02:23<00:00,  7.82it/s]


Epoch : 1 - loss : 0.1187 - acc: 0.7906 - val_loss : 0.0584 - val_acc: 0.8675



Epoch 2: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 2 - loss : 0.0576 - acc: 0.8851 - val_loss : 0.0478 - val_acc: 0.8771



Epoch 3: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 3 - loss : 0.0425 - acc: 0.9087 - val_loss : 0.0458 - val_acc: 0.8907



Epoch 4: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 4 - loss : 0.0320 - acc: 0.9253 - val_loss : 0.0445 - val_acc: 0.8911



Epoch 5: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 5 - loss : 0.0246 - acc: 0.9385 - val_loss : 0.0471 - val_acc: 0.8861



Epoch 6: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 6 - loss : 0.0194 - acc: 0.9440 - val_loss : 0.0503 - val_acc: 0.8907



Epoch 7: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 7 - loss : 0.0160 - acc: 0.9504 - val_loss : 0.0513 - val_acc: 0.8929



Epoch 8: 100%|██████████| 1119/1119 [02:23<00:00,  7.78it/s]


Epoch : 8 - loss : 0.0140 - acc: 0.9496 - val_loss : 0.0539 - val_acc: 0.8929



Epoch 9: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 9 - loss : 0.0119 - acc: 0.9544 - val_loss : 0.0552 - val_acc: 0.8968



Epoch 10: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 10 - loss : 0.0104 - acc: 0.9530 - val_loss : 0.0567 - val_acc: 0.8900



Epoch 11: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 11 - loss : 0.0098 - acc: 0.9546 - val_loss : 0.0614 - val_acc: 0.8896



Epoch 12: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 12 - loss : 0.0092 - acc: 0.9535 - val_loss : 0.0607 - val_acc: 0.8911



Epoch 13: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 13 - loss : 0.0094 - acc: 0.9516 - val_loss : 0.0583 - val_acc: 0.8907



Epoch 14: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 14 - loss : 0.0083 - acc: 0.9523 - val_loss : 0.0623 - val_acc: 0.8989



Epoch 15: 100%|██████████| 1119/1119 [02:23<00:00,  7.79it/s]


Epoch : 15 - loss : 0.0075 - acc: 0.9538 - val_loss : 0.0621 - val_acc: 0.8818



### 10. Accuracy Check on Test Dataset (Holdout)

In [53]:
# Validation Function
def validation(epoch):
    model.eval()
    fin_targets = []
    fin_outputs = []
    texts = []
    caseid = []
    
    with torch.no_grad():
        for batch in tqdm.tqdm(test_loader, desc=f"Validation Epoch"):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            texts.extend(batch['text'])  # Extracting the text column
            caseid.extend(batch['caseid'])
    
    return fin_outputs, fin_targets, texts, caseid

# For Validation (Hold out) Data
for epoch in range(1):
    outputs, targets, texts, caseid = validation(epoch)
    outputs = np.array(outputs) >= 0.65                                      #This can be tuned
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    #print("First three items of texts:", texts[:3])
    #print("First three items of fin_outputs:", outputs[:3])
    #print("First three items of fin_targets:", targets[:3])
    
    # Create a dictionary with column names as keys and lists as values
    final_df = {
        'Outputs': outputs,
        'Targets': targets,
        'Texts': texts,
        'Caseid': caseid
    }

Validation Epoch: 100%|██████████| 560/560 [00:11<00:00, 50.18it/s]


Accuracy Score = 0.8847184986595175
F1 Score (Micro) = 0.9143464399574921
F1 Score (Macro) = 0.7716799208281502


### 11. Creating Output Data for Inference

In [55]:
# Creating final output
final_case = final_df['Caseid']
final_case = pd.DataFrame(final_case, columns = ["Caseid"])

final_text = final_df['Texts']
final_text = pd.DataFrame(final_text, columns = ["Text"])

final_output = final_df['Outputs']
final_output = pd.DataFrame(final_output, columns = ["Freedom and Rights", "E Plurabus Unum", "Representation of the People", "Popular Will and Equality", "National Identity and Heritage", "Not a Democracy a Republic", 
                "Flawed Democracy", "Institution and Constitution", "Unclassified/Other", "Don't Know", "Nothing/Disaffected", "Nothing More to Add", "NA"])
final_output.replace({True: 1, False: 0}, inplace=True)

final_df = final_text.join(final_output)
final_df = final_case.join(final_df)

In [56]:
final_df.head(10)

Unnamed: 0,Caseid,Text,Freedom and Rights,E Plurabus Unum,Representation of the People,Popular Will and Equality,National Identity and Heritage,Not a Democracy a Republic,Flawed Democracy,Institution and Constitution,Unclassified/Other,Don't Know,Nothing/Disaffected,Nothing More to Add,NA
0,2056741823,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1
1,49666741,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1
2,228599999,The representation of all people in a society.,0,0,1,0,0,0,0,0,0,0,0,0,0
3,2058329003,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1
4,68956198,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1
5,222104515,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1
6,206325623,Freedom to choose your leaders ; freedom to mo...,1,0,0,1,0,0,0,0,0,0,0,0,0
7,2059352003,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1
8,2057934851,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1
9,2060349333,__NA__,0,0,0,0,0,0,0,0,0,0,0,0,1


### 12. Check Confusion Matrices

In [57]:
from sklearn.metrics import multilabel_confusion_matrix

true_np = targets
pred_np = outputs

# Calculate confusion matrix
conf_matrix = multilabel_confusion_matrix(true_np, pred_np)

# Define label titles
label_titles = ["Freedom and Rights", "E Plurabus Unum", "Representation of the People", "Popular Will and Equality", "National Identity and Heritage", "Not a Democracy a Republic", 
                "Flawed Democracy", "Institution and Constitution", "Unclassified/Other", "Don't Know", "Nothing/Disaffected", "Nothing More to Add", "NA"]

# Create a dictionary to store confusion matrices with titles
conf_matrix_dict = {}
for i, title in enumerate(label_titles):
    conf_matrix_dict[title] = conf_matrix[i]

# Print confusion matrices with titles
for title, matrix in conf_matrix_dict.items():
    print(f"Confusion matrix for {title}:")
    print(matrix)

Confusion matrix for Freedom and Rights:
[[1812   19]
 [  35  372]]
Confusion matrix for E Plurabus Unum:
[[2213    5]
 [  13    7]]
Confusion matrix for Representation of the People:
[[2181   13]
 [  14   30]]
Confusion matrix for Popular Will and Equality:
[[1817   26]
 [  44  351]]
Confusion matrix for National Identity and Heritage:
[[2230    0]
 [   3    5]]
Confusion matrix for Not a Democracy a Republic:
[[2211    0]
 [   1   26]]
Confusion matrix for Flawed Democracy:
[[2174   15]
 [  20   29]]
Confusion matrix for Institution and Constitution:
[[2153   23]
 [  17   45]]
Confusion matrix for Unclassified/Other:
[[2071   38]
 [  35   94]]
Confusion matrix for Don't Know:
[[2203    2]
 [   1   32]]
Confusion matrix for Nothing/Disaffected:
[[2175   21]
 [  26   16]]
Confusion matrix for Nothing More to Add:
[[2053   13]
 [  15  157]]
Confusion matrix for NA:
[[1247    0]
 [   4  987]]


In [54]:
# torch.save(model, "/storage/home/ndh5286/Projects/MOTN Transformer/DeBERTaV3_model_7.23.24.pth")

In [None]:
# model = torch.load("/storage/home/ndh5286/Projects/MOTN Transformer/final_model_6.2.24.pth")