In [1]:
import torch
import torch.nn as nn
import networkx as nx

from graphModels import *
import torch_geometric
from torch_geometric.utils import erdos_renyi_graph, to_networkx, from_networkx

import torch.nn.functional as F
import pandas as pd
import numpy as np

import re
pd.set_option('display.max_colwidth', None)

import math

from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import RobertaTokenizer, BertTokenizer, RobertaModel, BertModel, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence

import pprint
import time
import timeit


import random
import string

In [2]:
class Hi_Bert_Classification_Model_GCN(nn.Module):
    
    """ A Model for bert fine tuning, put an lstm on top of BERT encoding """

    def __init__(self, graph_type, num_class, device, adj_method, pooling_method='mean'):
        super(Hi_Bert_Classification_Model_GCN, self).__init__()
        self.graph_type = graph_type
        self.bert_path = 'bert-base-uncased'
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        
        # self.roberta = RobertaTokenizer.from_pretrained("roberta-base")


        self.lstm_layer_number = 2
        'default 128 and 32'
        self.lstm_hidden_size = 128
        self.hidden_dim = 32
        
        # self.bert_lstm = nn.Linear(768, self.lstm_hidden_size)
        self.device = device
        self.pooling_method=pooling_method

        self.mapping = nn.Linear(768, self.lstm_hidden_size).to(device)

        'start GCN'
        if self.graph_type == 'gcn':
            self.gcn = GCN(input_dim=self.lstm_hidden_size, hidden_dim=32, output_dim=num_class).to(device)
        elif self.graph_type == 'gat':
            self.gcn = GAT(input_dim=self.lstm_hidden_size, hidden_dim=32, output_dim=num_class).to(device)
        elif self.graph_type == 'graphsage':
            self.gcn = GraphSAGE(input_dim=self.lstm_hidden_size, hidden_dim=32, output_dim=num_class).to(device)
        elif self.graph_type == 'linear':
            self.gcn = LinearFirst(input_dim=self.lstm_hidden_size, hidden_dim=32, output_dim=num_class).to(device)
        elif self.graph_type == 'rank':
            self.gcn = SimpleRank(input_dim=self.lstm_hidden_size, hidden_dim=32, output_dim=num_class).to(device)
        elif self.graph_type == 'diffpool':
            self.gcn = DiffPool(self.device,max_nodes=10,input_dim=self.lstm_hidden_size, hidden_dim=32, output_dim=num_class).to(device)
        elif self.graph_type == 'hipool':
            self.gcn = HiPool(self.device,input_dim=self.lstm_hidden_size, hidden_dim=32, output_dim=num_class).to(device)
            
        self.adj_method = adj_method


    def forward(self, ids, mask, token_type_ids):

        # import pdb;pdb.set_trace()
        'encode bert'
        bert_ids = pad_sequence(ids).permute(1, 0, 2).long().to(self.device)
        bert_mask = pad_sequence(mask).permute(1, 0, 2).long().to(self.device)
        bert_token_type_ids = pad_sequence(token_type_ids).permute(1, 0, 2).long().to(self.device)
        batch_bert = []
        for emb_pool, emb_mask, emb_token_type_ids in zip(bert_ids, bert_mask, bert_token_type_ids):
            results = self.bert(emb_pool, attention_mask=emb_mask, token_type_ids=emb_token_type_ids)
            batch_bert.append(results[1])

        sent_bert = torch.stack(batch_bert, 0)
        'GCN starts'
        sent_bert = self.mapping(sent_bert)
        node_number = sent_bert.shape[1]
        

        'random, using networkx'
        if self.adj_method == 'random':
            generated_adj = nx.dense_gnm_random_graph(node_number, node_number)
        elif self.adj_method == 'er':
            generated_adj = nx.erdos_renyi_graph(node_number, node_number)
        elif self.adj_method == 'binom':
            generated_adj = nx.binomial_graph(node_number, p=0.5)
        elif self.adj_method == 'path':
            generated_adj = nx.path_graph(node_number)
        elif self.adj_method == 'complete':
            generated_adj = nx.complete_graph(node_number)
        elif self.adj_method == 'kk':
            generated_adj = kronecker_generator(node_number)
        elif self.adj_method == 'watts':
            if node_number-1 > 0:
                generated_adj = nx.watts_strogatz_graph(node_number, k=node_number-1, p=0.5)
            else:
                generated_adj = nx.watts_strogatz_graph(node_number, k=node_number, p=0.5)
        elif self.adj_method == 'ba':
            if node_number - 1>0:
                generated_adj = nx.barabasi_albert_graph(node_number, m=node_number-1)
            else:
                generated_adj = nx.barabasi_albert_graph(node_number, m=node_number)
        elif self.adj_method == 'bigbird':

            # following are attention edges
            attention_adj = np.zeros((node_number, node_number))
            global_attention_step = 2
            attention_adj[:, :global_attention_step] = 1
            attention_adj[:global_attention_step, :] = 1
            np.fill_diagonal(attention_adj,1) # fill diagonal with 1
            half_sliding_window_size = 1
            np.fill_diagonal(attention_adj[:,half_sliding_window_size:], 1)
            np.fill_diagonal(attention_adj[half_sliding_window_size:, :], 1)
            generated_adj = nx.from_numpy_matrix(attention_adj)

        else:
            generated_adj = nx.dense_gnm_random_graph(node_number, node_number)


        nx_adj = from_networkx(generated_adj)
        adj = nx_adj['edge_index'].to(self.device)

        'combine starts'
        # generated_adj2 = nx.dense_gnm_random_graph(node_number,node_number)
        # nx_adj = from_networkx(generated_adj)
        # adj = nx_adj['edge_index'].to(self.device)
        # nx_adj2 = from_networkx(generated_adj2)
        # adj2 = nx_adj2['edge_index'].to(self.device)
        # adj = torch.cat([adj2, adj], 1)
        'combine ends'

        if self.adj_method == 'complete':
            'complete connected'
            adj = torch.ones((node_number,node_number)).to_sparse().indices().to(self.device)

        if self.graph_type.endswith('pool'):
            'diffpool only accepts dense adj'
            adj_matrix = nx.adjacency_matrix(generated_adj).todense()
            adj_matrix = torch.from_numpy(np.asarray(adj_matrix)).to(self.device)
            adj = (adj,adj_matrix)
        # if self.args.graph_type == 'hipool':

        # sent_bert shape torch.Size([batch_size, 3, 768])
        gcn_output_batch = []
        for node_feature in sent_bert:
            # import pdb;pdb.set_trace()

            gcn_output=self.gcn(node_feature, adj)

            'graph-level read out, summation'
            gcn_output = torch.sum(gcn_output,0)
            gcn_output_batch.append(gcn_output)

        # import pdb;
        # pdb.set_trace()

        gcn_output_batch = torch.stack(gcn_output_batch, 0)

        'GCN ends'

        # import pdb;
        # pdb.set_trace()
        return gcn_output_batch,generated_adj # (batch_size, class_number)

In [3]:
load_path = '/scratch/smanduru/NLP/project/saved_models' + '/hipool_20eps.pth'

# Load the entire model
model = torch.load(load_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model.to(device)

Using device: cuda


Hi_Bert_Classification_Model_GCN(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [4]:
class Preprocess:
    
    def __init__(self, testPath):
        
        self.test_df = pd.read_csv(testPath, sep = '\t', header=0)
        self.test_df['review'] = self.test_df['headline'].str.cat(self.test_df['text'], sep=' ')
        
    
    def clean_text(self, sentence):
        cleaned_sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
        cleaned_sentence = re.sub(r'\s+', ' ', cleaned_sentence).strip()
        return cleaned_sentence.lower()
        
    def get_clean(self):
        
        self.test_df['cleaned_text'] = self.test_df['review'].apply(self.clean_text)
        return self.test_df[['cleaned_text', 'label']]

In [5]:
pr = Preprocess("/scratch/smanduru/NLP/project/data/amazon_2048/amazon-books-2048-test.tsv")

test = pr.get_clean()

In [6]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


class CustomDataset(Dataset):
    
    def __init__(self, tokenizer, max_len, df, chunk_len=200, overlap_len=50, approach="all", max_size_dataset=None, min_len=249):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.overlap_len = overlap_len
        self.chunk_len = chunk_len
        self.approach = approach
        self.min_len = min_len
        self.max_size_dataset = max_size_dataset
        self.data, self.label = self.process_data(df)
        
    def process_data(self, df):
        self.num_class = len(set(df['label'].values))
        return df['cleaned_text'].values, df['label'].values
    
    def long_terms_tokenizer(self, data_tokenize, targets):
        long_terms_token = []
        input_ids_list = []
        attention_mask_list = []
        token_type_ids_list = []
        targets_list = []

        previous_input_ids = data_tokenize["input_ids"].reshape(-1)
        previous_attention_mask = data_tokenize["attention_mask"].reshape(-1)
        previous_token_type_ids = data_tokenize["token_type_ids"].reshape(-1)
        remain = data_tokenize.get("overflowing_tokens")
        targets = torch.tensor(targets, dtype=torch.int)
        
        start_token = torch.tensor([101], dtype=torch.long)
        end_token = torch.tensor([102], dtype=torch.long)

        total_token = len(previous_input_ids) -2 # remove head 101, tail 102
        stride = self.overlap_len - 2
        number_chunks = math.floor(total_token/stride)

        mask_list = torch.ones(self.chunk_len, dtype=torch.long)
        type_list = torch.zeros(self.chunk_len, dtype=torch.long)
        
        for current in range(number_chunks-1):
            input_ids = previous_input_ids[current*stride:current*stride+self.chunk_len-2]
            input_ids = torch.cat((start_token, input_ids, end_token))
            input_ids_list.append(input_ids)

            attention_mask_list.append(mask_list)
            token_type_ids_list.append(type_list)
            targets_list.append(targets)

        if len(input_ids_list) == 0:
            input_ids = torch.ones(self.chunk_len-2, dtype=torch.long)
            input_ids = torch.cat((start_token, input_ids, end_token))
            input_ids_list.append(input_ids)

            attention_mask_list.append(mask_list)
            token_type_ids_list.append(type_list)
            targets_list.append(targets)

        return({
            'ids': input_ids_list,
            'mask': attention_mask_list,
            'token_type_ids': token_type_ids_list,
            'targets': targets_list,
            'len': [torch.tensor(len(targets_list), dtype=torch.long)]
        })
    
    def __getitem__(self, idx):
        
        review = str(self.data[idx])
        targets = int(self.label[idx])
        data = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            pad_to_max_length=False,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_overflowing_tokens=True,
            return_tensors='pt')
        
        long_token = self.long_terms_tokenizer(data, targets)
        return long_token
    
    def __len__(self):
        return self.label.shape[0]

In [7]:
def my_collate1(batches):
    return [{key: torch.stack(value) for key, value in batch.items()} for batch in batches]

MAX_LEN = 1024
CHUNK_LEN = 200
OVERLAP_LEN = int(CHUNK_LEN/2)

TRAIN_BATCH_SIZE = 16
EPOCH = 20
lr=1e-5

test_dataset = CustomDataset(
    tokenizer = bert_tokenizer,
    max_len = MAX_LEN,
    chunk_len = CHUNK_LEN,
    overlap_len = OVERLAP_LEN,
    df = test)


test_loader = DataLoader(test_dataset,
                          batch_size = 32, 
                          shuffle = False, 
                          collate_fn = my_collate1)

In [8]:
def loss_fun(outputs, targets):
    loss = nn.CrossEntropyLoss()
    return loss(outputs, targets)

def eval_loop_fun1(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    losses = []
    for batch_idx, batch in enumerate(data_loader):
        ids = [data["ids"] for data in batch]  # size of 8
        mask = [data["mask"] for data in batch]
        token_type_ids = [data["token_type_ids"] for data in batch]
        targets = [data["targets"] for data in batch]  # length: 8

        with torch.no_grad():
            target_labels = torch.stack([x[0] for x in targets]).long().to(device)
            outputs, _ = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = loss_fun(outputs, target_labels)
            losses.append(loss.item())

        fin_targets.append(target_labels.cpu().detach().numpy())
        fin_outputs.append(torch.softmax(outputs, dim=1).cpu().detach().numpy())
    return np.concatenate(fin_outputs), np.concatenate(fin_targets), losses

def evaluate(target, predicted):
    true_label_mask = [1 if (np.argmax(x)-target[i]) ==
                       0 else 0 for i, x in enumerate(predicted)]
    nb_prediction = len(true_label_mask)
    true_prediction = sum(true_label_mask)
    false_prediction = nb_prediction-true_prediction
    accuracy = true_prediction/nb_prediction
    return{
        "accuracy": accuracy,
        "nb exemple": len(target),
        "true_prediction": true_prediction,
        "false_prediction": false_prediction,
    }

In [9]:
model.eval()

Hi_Bert_Classification_Model_GCN(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [10]:
# Evaluate the test set
predicted_probs, true_labels, losses = eval_loop_fun1(test_loader, model, device)

# Evaluate accuracy
evaluation_result = evaluate(true_labels, predicted_probs)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  adj_matrix = nx.adjacency_matrix(generated_adj).todense()


In [11]:
evaluation_result

{'accuracy': 0.563,
 'nb exemple': 1000,
 'true_prediction': 563,
 'false_prediction': 437}

In [12]:
misclassified_examples = []
classified_examples = []
for i, (true_label, predicted_prob) in enumerate(zip(true_labels, predicted_probs)):
    predicted_label = np.argmax(predicted_prob)
    if predicted_label != true_label:
        misclassified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })
    else:
        classified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })

In [13]:
# Shuffle misclassified examples
np.random.shuffle(misclassified_examples)

# Print 5 randomly selected misclassified examples
for example in misclassified_examples[:5]:
    print(f"Example Index: {example['Example Index']}")
    print(f"True Label: {example['True Label']}, Predicted Label: {example['Predicted Label']}")
    print(f"Raw Input Sentence: {example['Raw Input']}")
    print("\n" + "="*50 + "\n")
    break

Example Index: 375
True Label: 1, Predicted Label: 3
Raw Input Sentence: a great idea that sadly missed the mark i used to work selling cosmetic ingredients meaning that i didn t work for any particular company but in selling ingredients via a broker to cosmetic chemists and scientists of major cosmetic corporations because of this i learned a wealth of information that i am not sure the author may have gained although i do believe she has good intentions the major discrepency i would point out at is that petroleum based products are her go to moisturizer petroleum jelly and petro based ingredients are very cheap for companies to use so it s been highly popular however it s been banned for use in cosmetics or skin care ingredients in europe because it s been shown to often carry carcinogens it does after all come from the ground it is actually currently going through fda retesting right now in the united states because of this google petrolatum cosmetic toxin data it takes waaaay longe

# Robustness - Jumbling

In [14]:
selected_examples = random.sample(classified_examples, min(10, len(classified_examples)))

In [15]:
indexes = [] # [587, 417, 364, 82, 311, 642, 872, 152, 287, 314] [944, 535, 909, 912, 55, 186, 297, 830, 541, 871]
for each in selected_examples:
    indexes.append(each['Example Index'])

indexes

# [7, 120, 586, 769, 63, 931, 322, 371, 51, 105] - Graph Sage

[189, 699, 468, 446, 863, 605, 132, 603, 38, 874]

In [16]:
robust_df = pd.DataFrame(selected_examples)

columns_to_drop = ['Predicted Label', 'Predicted Probabilities']
robust_df = robust_df.drop(columns=columns_to_drop)

In [17]:
# Jumbling function
def jumble_sentence(sentence):
    # Split the sentence into words
    words = sentence.split()
    
    # Jumble the words
    jumbled_words = random.sample(words, len(words))
    
    # Join the jumbled words back into a sentence
    jumbled_sentence = ' '.join(jumbled_words)
    
    return jumbled_sentence

robust_df['jumbled_sentence'] = robust_df['Raw Input'].apply(jumble_sentence)

In [18]:
robust_df.columns

Index(['Example Index', 'True Label', 'Raw Input', 'jumbled_sentence'], dtype='object')

In [19]:
robust_df = robust_df.rename(columns={'True Label': 'label'})
robust_df = robust_df.rename(columns={'jumbled_sentence': 'cleaned_text'})

In [20]:
robust_dataset = CustomDataset(
    tokenizer = bert_tokenizer,
    max_len = MAX_LEN,
    chunk_len = CHUNK_LEN,
    overlap_len = OVERLAP_LEN,
    df = robust_df[['cleaned_text', 'label']])


robust_loader = DataLoader(robust_dataset,
                          batch_size = 2, 
                          shuffle = False, 
                          collate_fn = my_collate1)

In [21]:
# Evaluate the test set
predicted_probs, true_labels, losses = eval_loop_fun1(robust_loader, model, device)

# Evaluate accuracy
evaluation_result = evaluate(true_labels, predicted_probs)

  adj_matrix = nx.adjacency_matrix(generated_adj).todense()


In [22]:
evaluation_result

{'accuracy': 0.8,
 'nb exemple': 10,
 'true_prediction': 8,
 'false_prediction': 2}

In [23]:
robust_misclassified_examples = []
robust_classified_examples = []
for i, (true_label, predicted_prob) in enumerate(zip(true_labels, predicted_probs)):
    predicted_label = np.argmax(predicted_prob)
    if predicted_label != true_label:
        robust_misclassified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })
    else:
        robust_classified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })

In [24]:
robust_misclassified_examples

[{'Example Index': 6,
  'True Label': 4,
  'Predicted Label': 0,
  'Predicted Probabilities': [0.977340579032898,
   0.0034738732501864433,
   0.0034738732501864433,
   0.0034738732501864433,
   0.012237735092639923],
  'Raw Input': 'a book that makes you think about things you should be thinking about on so many levels when i was in high school it was clearly stated in the west mesa student handbook that a student s participation in activities not to be confused with sports was to be limited to a set number of activities that could not exceed a certain number of points the idea was clearly to distribute opportunities to students at what was then the largest high school in the state of new mexico so large in fact the school was on split session with juniors and seniors attending from to noon before the afternoon shift showed up under these rules a student would not be able to be student body president co editor of the newspaper a member of the national honor society compete in speech a

# Robustness - Drop Some Words Randomly

In [25]:
robust_df = robust_df[['Example Index', 'label', 'Raw Input']]

In [26]:
def drop_words(sentence):
    words = sentence.split()
    
    # Randomly choose a percentage of words to drop (adjust as needed)
    percentage_to_drop = 0.3  # 30% of words will be dropped
    num_words_to_drop = int(len(words) * percentage_to_drop)
    
    # Randomly select words to drop
    words_to_drop = random.sample(words, num_words_to_drop)
    
    # Create a new sentence without the dropped words
    new_sentence = ' '.join(word for word in words if word not in words_to_drop)
    
    return new_sentence

# Create a new column 'Robust Input' by applying the drop_words function
robust_df['cleaned_text'] = robust_df['Raw Input'].apply(drop_words)

In [27]:
robust_dataset = CustomDataset(
    tokenizer = bert_tokenizer,
    max_len = MAX_LEN,
    chunk_len = CHUNK_LEN,
    overlap_len = OVERLAP_LEN,
    df = robust_df[['cleaned_text', 'label']])


robust_loader = DataLoader(robust_dataset,
                          batch_size = 2, 
                          shuffle = False, 
                          collate_fn = my_collate1)

# Evaluate the test set
predicted_probs, true_labels, losses = eval_loop_fun1(robust_loader, model, device)

# Evaluate accuracy
evaluation_result = evaluate(true_labels, predicted_probs)

evaluation_result

  adj_matrix = nx.adjacency_matrix(generated_adj).todense()


{'accuracy': 0.8,
 'nb exemple': 10,
 'true_prediction': 8,
 'false_prediction': 2}

In [28]:
robust_misclassified_examples = []
robust_classified_examples = []
for i, (true_label, predicted_prob) in enumerate(zip(true_labels, predicted_probs)):
    predicted_label = np.argmax(predicted_prob)
    if predicted_label != true_label:
        robust_misclassified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })
    else:
        robust_classified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })

In [29]:
robust_misclassified_examples

[{'Example Index': 5,
  'True Label': 4,
  'Predicted Label': 3,
  'Predicted Probabilities': [0.003727874020114541,
   0.003727874020114541,
   0.003727874020114541,
   0.8117092847824097,
   0.177107036113739],
  'Raw Input': 'worth its weight ten times over in pieces of eight on one side stood the devil captain who backed by the merchant and royal official held near dictorial powers that served captialism globally on the other side stood the relentlessly dangerous narural world the deep blue sea in this fine and scholarly work by buford rediker the reader will find a bottom up history of the common seaman focusing on roughly the early th century some what like howard zinn s peoples history some years later on the american worker the sailor tar experiences of that time pointed in many ways toward the industrial revolution to come in the domain of culture as in others seamen were enmeshed in the momentous transition in which the world came to be governed by capitalism and class it is 

# Robustness - Misspelled Words

In [30]:
robust_df = robust_df[['Example Index', 'label', 'Raw Input']]

In [31]:
def introduce_misspellings(sentence):
    words = sentence.split()

    # Randomly choose a percentage of words to misspell (adjust as needed)
    percentage_to_misspell = 0.05  # 20% of words will be misspelled
    num_words_to_misspell = int(len(words) * percentage_to_misspell)
    print(num_words_to_misspell, len(words))

    # Create a set of misspelled versions of the alphabet
    misspelled_alphabet = {
        'a': 'ae',
        'b': 'bf',
        'c': 'cd',
        'd': 'de',
        'e': 'ea',
        'f': 'fg',
        'g': 'gh',
        'h': 'hi',
        'i': 'ij',
        'j': 'jk',
        'k': 'kl',
        'l': 'lm',
        'm': 'mn',
        'n': 'no',
        'o': 'op',
        'p': 'pq',
        'q': 'qr',
        'r': 'rs',
        's': 'st',
        't': 'tu',
        'u': 'uv',
        'v': 'vw',
        'w': 'wx',
        'x': 'xy',
        'y': 'yz',
        'z': 'zx'
    }

    # Randomly select words to misspell
    words_to_misspell = random.sample(words, num_words_to_misspell)

    # Replace selected words with misspelled versions
    misspelled_words = [misspelled_alphabet.get(word[0], word) + ''.join(random.choice(string.ascii_lowercase) for _ in range(len(word) - 1)) for word in words_to_misspell]

    # Create a new sentence with misspelled words
    new_sentence = ' '.join(misspelled_alphabet.get(word[0], word) + ''.join(random.choice(string.ascii_lowercase) for _ in range(len(word) - 1)) if word in words_to_misspell else word for word in words)
    return new_sentence

# Create a new column 'Robust Input' by applying the introduce_misspellings function
robust_df['cleaned_text'] = robust_df['Raw Input'].apply(introduce_misspellings)

95 1912
121 2425
88 1776
120 2417
84 1687
141 2823
79 1598
92 1848
104 2099
82 1641


In [32]:
robust_dataset = CustomDataset(
    tokenizer = bert_tokenizer,
    max_len = MAX_LEN,
    chunk_len = CHUNK_LEN,
    overlap_len = OVERLAP_LEN,
    df = robust_df[['cleaned_text', 'label']])


robust_loader = DataLoader(robust_dataset,
                          batch_size = 2, 
                          shuffle = False, 
                          collate_fn = my_collate1)

# Evaluate the test set
predicted_probs, true_labels, losses = eval_loop_fun1(robust_loader, model, device)

# Evaluate accuracy
evaluation_result = evaluate(true_labels, predicted_probs)

evaluation_result

  adj_matrix = nx.adjacency_matrix(generated_adj).todense()


{'accuracy': 0.7,
 'nb exemple': 10,
 'true_prediction': 7,
 'false_prediction': 3}

In [33]:
robust_misclassified_examples = []
robust_classified_examples = []
for i, (true_label, predicted_prob) in enumerate(zip(true_labels, predicted_probs)):
    predicted_label = np.argmax(predicted_prob)
    if predicted_label != true_label:
        robust_misclassified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })
    else:
        robust_classified_examples.append({
            'Example Index': i,
            'True Label': true_label,
            'Predicted Label': predicted_label,
            'Predicted Probabilities': predicted_prob.tolist(),
            'Raw Input': test.iloc[i]['cleaned_text']
        })

In [34]:
robust_misclassified_examples

[{'Example Index': 3,
  'True Label': 4,
  'Predicted Label': 3,
  'Predicted Probabilities': [0.10665066540241241,
   0.05125516653060913,
   0.07145173102617264,
   0.4006871283054352,
   0.3699553608894348],
  'Raw Input': 'when does a fetus become a person fritz k beller robert f weir editorsthe beginning of human life dordrecht nl kluwer academic publishers pages isbn hardcover library of congress call number rg b papers from a conference in on the beginning of human life major divisions fetal development assisted reproduction technology contraception abortion fetal research fetal tissue maternal fetal relationship medical ethical legal aspects defective fetuses newborns the editors of this collection are well aware of the religious reasonsfor raising the question of the beginning of human life but they present scientific data as presently knownrather than metaphysical speculationsabout the beginning of each human being fertilization creates a unique strand of dna in the human egg