In [2]:
# Import libraries

import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, precision_score, recall_score
import logging


In [3]:
# Load data from csv files

df_gpt4 = pd.read_csv("/Users/maxschaffelder/Desktop/Thesis/Data/GPT3/data_GPT3_para6.csv")
df_human = pd.read_csv("/Users/maxschaffelder/Desktop/Thesis/Data/Human/data_human_para2.csv")

# Remove unnecessary columns
df_gpt4 = df_gpt4.drop(["prompt"], axis=1)

# Add labels for classification
df_gpt4["label"] = 1
df_human["label"] = 0

# Combine human and GPT datasets into two datasets, one for gpt3.5 and one for gpt4
df_4 = pd.concat([df_gpt4, df_human])

df_4.head()


Unnamed: 0.1,Unnamed: 0,content,label
0,0,Introduction \n\nFeminist Standpoint Theory is...,1
1,1,Mitigating gentrification: How creative design...,1
2,2,Introduction\n\nDrug use and abuse are signifi...,1
3,3,Introduction\n\nBharti Airtel is one of the le...,1
4,4,Introduction\n\nThe problem of induction is a ...,1


In [4]:
# Shuffle datasets

#df_4 = df_4.sample(frac=1, random_state=42)
df_4 = df_4.reset_index(drop=True)

#df_4.head()

In [5]:
# Separate data into paragraphs

def separate_paragraphs(df, essay_column_name):
    essays_by_paragraph = []

     # Create a list to store the parsed paragraphs and essay indices
    parsed_data = []

    # Iterate over the essays in the specified column
    for essay_index, essay in enumerate(df[essay_column_name]):
        
        # Separate the essay into paragraphs
        paragraphs = essay.split('\n')

        # Create a list of tuples with the essay index and paragraph text
        for paragraph in paragraphs:
            parsed_data.append((essay_index, paragraph))

    # Create a new DataFrame with the parsed paragraphs and essay indices
    parsed_df = pd.DataFrame(parsed_data, columns=['essay_index', 'paragraph'])

    # Merge the original DataFrame with the parsed DataFrame
    merged_df = df.merge(parsed_df, left_index=True, right_on='essay_index')

    # Drop the essay_index column as it is not needed anymore
    merged_df.drop('essay_index', axis=1, inplace=True)

    return merged_df


df_4 = separate_paragraphs(df_4, "content")
#df_4.head()


In [6]:
# Function to tokenize all data in the dataframe, by paragraph, making each entry a maximum of 512 tokens, 
# and adding padding if it's shorter

def tokenize_df(df):
    
    # import tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 
    
    # initialize columns of new df
    tokenized_df_columns = ["tokenized text", "attention mask", "label"]
    
    # initialize new df for tokenized data
    tokenized_df = pd.DataFrame({col: [] for col in tokenized_df_columns})
    
    # temporary variable storing tokenized text
    combined_paragraph_tokens = []
    combined_paragraph_attention_masks = []
    
    # variable storing the current essay number in order to not combine different essays
    current_essay_nr = df["Unnamed: 0"][0] # initial value is first essay in list
    
    max_len = 512 # roberta model has a maximum input size of 512
    
    # Looping through original df (non-tokenized), getting paragraphs, labels, and essay number
    for paragraph, label, essay_nr in zip(df["paragraph"], df["label"], df["Unnamed: 0"]):

        # encoding the current paragraph
        paragraph_tokens = tokenizer.encode(paragraph) # tokens
        paragraph_attention_mask = tokenizer(paragraph)["attention_mask"] # attention mask
        
        # checking that 512 token length is not surpassed
        if len(combined_paragraph_tokens) + len(paragraph_tokens) <= max_len and essay_nr == current_essay_nr:
            #add new tokens
            combined_paragraph_tokens = combined_paragraph_tokens + paragraph_tokens 
            # add new attention maskss
            combined_paragraph_attention_masks = combined_paragraph_attention_masks + paragraph_attention_mask 
            
        # else if it would be too long, add entry and start new one
        elif len(combined_paragraph_tokens) + len(paragraph_tokens) > max_len:
            
            # add padding
            padding_amount = max_len - len(combined_paragraph_tokens)
            padding = [1 for i in range(padding_amount)]
            padding_attention_mask = [0 for i in range(padding_amount)]
            
            combined_paragraph_tokens = combined_paragraph_tokens + padding
            combined_paragraph_attention_masks = combined_paragraph_attention_masks + padding_attention_mask
            
            new_entry = {'tokenized text': combined_paragraph_tokens, "attention mask": combined_paragraph_attention_masks, 'label': label}
            tokenized_df.loc[len(tokenized_df)] = new_entry
            combined_paragraph_tokens = []
            combined_paragraph_attention_masks = []
            
        # else if a new essay has started
        elif essay_nr != current_essay_nr:
            
            # add padding
            padding_amount = max_len - len(combined_paragraph_tokens)
            padding = [1 for i in range(padding_amount)]
            padding_attention_mask = [0 for i in range(padding_amount)]
            
            combined_paragraph_tokens = combined_paragraph_tokens + padding
            combined_paragraph_attention_masks = combined_paragraph_attention_masks + padding_attention_mask
            
            new_entry = {'tokenized text': combined_paragraph_tokens, "attention mask": combined_paragraph_attention_masks, 'label': label}
            tokenized_df.loc[len(tokenized_df)] = new_entry
            combined_paragraph_tokens = []
            combined_paragraph_attention_masks = []
            current_essay_nr = essay_nr
           
    return tokenized_df


In [7]:
tokenized_df4 = tokenize_df(df_4)
#tokenized_df4.head()

  element = np.asarray(element)
Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# Remove rows where the list only contains 1s or starts with [0, 2] and contains only 1s afterwards
print(len(tokenized_df4))
tokenized_df4 = tokenized_df4.loc[~tokenized_df4['tokenized text'].apply(lambda x: all(e == 1 for e in x) or (len(x) >= 2 and x[:2] == [0, 2] and all(e == 1 for e in x[2:]))), :]

# Print the resulting DataFrame
len(tokenized_df4["tokenized text"])


751


658

In [None]:
# PREPARE DATA INPUT

# Split data into training, test, and validation datasets
train_data, test_data = train_test_split(tokenized_df4, test_size=0.2)#, random_state=5)
val_data, test_data = train_test_split(test_data, test_size=0.5)#, random_state=5)


# Convert pd dataframes into pytorch tensors

train_inputs = torch.tensor(train_data["tokenized text"].tolist())
train_masks = torch.tensor(train_data["attention mask"].tolist())
train_labels = torch.tensor(train_data["label"].tolist())


validation_inputs = torch.tensor(val_data["tokenized text"].tolist())
validation_masks = torch.tensor(val_data["attention mask"].tolist())
validation_labels = torch.tensor(val_data["label"].tolist())


# Create dataloader

batch_size = 16 # try different values

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
# TRAIN MODEL

# Create model 

logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

epochs = 3
total_steps = len(train_dataloader) * epochs
optimizer = optim.AdamW(model.parameters(), lr=0.000019999, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print_every = 100

# Training loop
for epoch in range(epochs):

    model.train()
    
    for step, batch in enumerate(train_dataloader):
        
        # Unpack batch into input tensors, attention masks, and labels
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Clear previously calculated gradients
        model.zero_grad()

        # Perform a forward pass through model and compute loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagation
        loss.backward()

        # Clip the gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update weights
        optimizer.step()

        # Update learning rate
        scheduler.step()
      
    # Evaluate model after each iteration
    model.eval()
    eval_loss = 0.0
    for batch in validation_dataloader:

        # Unpack batch into input tensors, attention masks, and labels
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Disable gradient calculations
        with torch.no_grad():
            # Perform a forward pass through the model and compute the loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Accumulate the evaluation loss
            eval_loss += loss.item()

    # Compute and print average validation loss
    avg_eval_loss = eval_loss / len(validation_dataloader)

    print(f"Epoch {epoch+1}, Validation Loss: {avg_eval_loss}")
   



Epoch 1, Validation Loss: 0.0049404256045818325
Epoch 2, Validation Loss: 0.0009136170614510774
Epoch 3, Validation Loss: 0.0008539987960830331


In [None]:
# EVALUATE MODEL

# Evaluating the model with the test dataset

# Split data into training, test, validation dataset
#train_data, test_data = train_test_split(tokenized_df4, test_size=0.2, random_state=5)
#val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=5)

test_inputs = torch.tensor(test_data["tokenized text"].tolist())
test_masks = torch.tensor(test_data["attention mask"].tolist())
test_labels = torch.tensor(test_data["label"].tolist())

batch_size = 32 # try different values

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

model.eval()
correct_predictions = 0
total_predictions = 0
true_labels = []
predicted_labels = []


for batch in test_dataloader:

    # Unpack batch into input tensors, attention masks, and labels
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)

    # Disable gradient calculations
    with torch.no_grad():
      
        # Perform a forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get the logits and calculate the probabilities
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)

        # Obtain the predicted class
        predicted_class = torch.argmax(probabilities, dim=-1)

        # Compare the predicted class with the true labels and update the counters
        correct_predictions += (predicted_class == labels).sum().item()
        total_predictions += labels.size(0)

        # Append the true labels and predicted labels to their lists
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted_class.cpu().numpy())

accuracy = correct_predictions / total_predictions
f1 = f1_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)

accuracy_scores.append(accuracy)
f1_scores.append(f1)
precision_scores.append(precision)
recall_scores.append(recall)

print(f"Accuracy: {accuracy:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")

mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)

print(f"Mean accuracy over {len(accuracy_scores)} trials: {mean_accuracy}")
print(f"Mean F1 over {len(f1_scores)} trials: {mean_f1}")
print(f"Mean precision over {len(precision_scores)} trials: {mean_precision}")
print(f"Mean recall over {len(recall_scores)} trials: {mean_recall}")

Accuracy: 1.000
F1 Score: 1.000
Precision: 1.000
Recall: 1.000
Mean accuracy over 11 trials: 0.9917355371900828
Mean F1 over 11 trials: 0.9902853090730571
Mean precision over 11 trials: 0.983819016078471
Mean recall over 11 trials: 0.9972451790633609


In [None]:
# TRAIN MODEL MULTIPLE TIMES FOR COMPARISON

EPOCHS = 1
BATCH_SIZE = 16

# Initialize list to get mean in the end

accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

for i in range(50):

  # PREPARE DATA INPUT

  # Split data into training, test, and validation datasets
  train_data, test_data = train_test_split(tokenized_df4, test_size=0.3)#, random_state=5)
  val_data, test_data = train_test_split(test_data, test_size=0.5)#, random_state=5)


  # Convert pd dataframes into pytorch tensors

  train_inputs = torch.tensor(train_data["tokenized text"].tolist())
  train_masks = torch.tensor(train_data["attention mask"].tolist())
  train_labels = torch.tensor(train_data["label"].tolist())


  validation_inputs = torch.tensor(val_data["tokenized text"].tolist())
  validation_masks = torch.tensor(val_data["attention mask"].tolist())
  validation_labels = torch.tensor(val_data["label"].tolist())


  # Create dataloader

  batch_size = BATCH_SIZE # try different values

  train_data = TensorDataset(train_inputs, train_masks, train_labels)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  # TRAIN MODEL

  # Create model 

  model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

  epochs = EPOCHS
  total_steps = len(train_dataloader) * epochs
  optimizer = optim.AdamW(model.parameters(), lr=0.00002, eps=1e-8)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  print_every = 100

  # Training loop
  for epoch in range(epochs):

      model.train()
      
      for step, batch in enumerate(train_dataloader):
          
          # Unpack batch into input tensors, attention masks, and labels
          input_ids = batch[0].to(device)
          attention_mask = batch[1].to(device)
          labels = batch[2].to(device)

          # Clear previously calculated gradients
          model.zero_grad()

          # Perform a forward pass through model and compute loss
          outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
          loss = outputs.loss

          # Backpropagation
          loss.backward()

          # Clip the gradients to prevent exploding gradients
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

          # Update weights
          optimizer.step()

          # Update learning rate
          scheduler.step()
        
      # Evaluate model after each iteration
      model.eval()
      eval_loss = 0.0
      for batch in validation_dataloader:

          # Unpack batch into input tensors, attention masks, and labels
          input_ids = batch[0].to(device)
          attention_mask = batch[1].to(device)
          labels = batch[2].to(device)

          # Disable gradient calculations
          with torch.no_grad():
              # Perform a forward pass through the model and compute the loss
              outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
              loss = outputs.loss

              # Accumulate the evaluation loss
              eval_loss += loss.item()

      # Compute and print average validation loss
      avg_eval_loss = eval_loss / len(validation_dataloader)
      #print(f"Epoch {epoch+1}, Validation Loss: {avg_eval_loss}")
    

  # EVALUATE MODEL

  # Evaluating the model with the test dataset

  # Split data into training, test, validation dataset
  #train_data, test_data = train_test_split(tokenized_df4, test_size=0.2, random_state=5)
  #val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=5)

  test_inputs = torch.tensor(test_data["tokenized text"].tolist())
  test_masks = torch.tensor(test_data["attention mask"].tolist())
  test_labels = torch.tensor(test_data["label"].tolist())

  batch_size = 32 # try different values

  test_data = TensorDataset(test_inputs, test_masks, test_labels)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

  model.eval()
  correct_predictions = 0
  total_predictions = 0
  true_labels = []
  predicted_labels = []


  for batch in test_dataloader:

      # Unpack batch into input tensors, attention masks, and labels
      input_ids = batch[0].to(device)
      attention_mask = batch[1].to(device)
      labels = batch[2].to(device)

      # Disable gradient calculations
      with torch.no_grad():
        
          # Perform a forward pass through the model
          outputs = model(input_ids, attention_mask=attention_mask)

          # Get the logits and calculate the probabilities
          logits = outputs.logits
          probabilities = torch.softmax(logits, dim=-1)

          # Obtain the predicted class
          predicted_class = torch.argmax(probabilities, dim=-1)

          # Compare the predicted class with the true labels and update the counters
          correct_predictions += (predicted_class == labels).sum().item()
          total_predictions += labels.size(0)

          # Append the true labels and predicted labels to their lists
          true_labels.extend(labels.cpu().numpy())
          predicted_labels.extend(predicted_class.cpu().numpy())

  accuracy = correct_predictions / total_predictions
  f1 = f1_score(true_labels, predicted_labels)
  precision = precision_score(true_labels, predicted_labels)
  recall = recall_score(true_labels, predicted_labels)

  accuracy_scores.append(accuracy)
  f1_scores.append(f1)
  precision_scores.append(precision)
  recall_scores.append(recall)

  print("Iteration", i)
  print(f"Accuracy: {accuracy:.3f}")
  print(f"F1 Score: {f1:.3f}")
  print(f"Precision: {precision:.3f}")
  print(f"Recall: {recall:.3f}")
  print("#########################")

mean_accuracy = round(sum(accuracy_scores) / len(accuracy_scores), 4)
mean_f1 = round(sum(f1_scores) / len(f1_scores), 4)
mean_precision = round(sum(precision_scores) / len(precision_scores), 4)
mean_recall = round(sum(recall_scores) / len(recall_scores), 4)

print(f"Mean accuracy over {len(accuracy_scores)} trials: {mean_accuracy}")
print(f"Mean F1 over {len(f1_scores)} trials: {mean_f1}")
print(f"Mean precision over {len(precision_scores)} trials: {mean_precision}")
print(f"Mean recall over {len(recall_scores)} trials: {mean_recall}")

In [None]:
# LOOP TO ANALYZE VALIDATION LOSS WITH DIFFERENT LEARNING RATES

for i in range(10):
  # PREPARE DATA INPUT

  # Split data into training, test, and validation datasets
  train_data, test_data = train_test_split(tokenized_df4, test_size=0.2)#, random_state=5)
  val_data, test_data = train_test_split(test_data, test_size=0.5)#, random_state=5)


  # Convert pd dataframes into pytorch tensors

  train_inputs = torch.tensor(train_data["tokenized text"].tolist())
  train_masks = torch.tensor(train_data["attention mask"].tolist())
  train_labels = torch.tensor(train_data["label"].tolist())


  validation_inputs = torch.tensor(val_data["tokenized text"].tolist())
  validation_masks = torch.tensor(val_data["attention mask"].tolist())
  validation_labels = torch.tensor(val_data["label"].tolist())


  # Create dataloader

  batch_size = 16 # try different values

  train_data = TensorDataset(train_inputs, train_masks, train_labels)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  # TRAIN MODEL

  # Create model 

  logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
  model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

  epochs = 3
  total_steps = len(train_dataloader) * epochs
  optimizer = optim.AdamW(model.parameters(), lr=0.000019999, eps=1e-8)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  print_every = 100

  loss_list = []

  # Training loop
  for epoch in range(epochs):

      model.train()
      
      for step, batch in enumerate(train_dataloader):
          
          # Unpack batch into input tensors, attention masks, and labels
          input_ids = batch[0].to(device)
          attention_mask = batch[1].to(device)
          labels = batch[2].to(device)

          # Clear previously calculated gradients
          model.zero_grad()

          # Perform a forward pass through model and compute loss
          outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
          loss = outputs.loss

          # Backpropagation
          loss.backward()

          # Clip the gradients to prevent exploding gradients
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

          # Update weights
          optimizer.step()

          # Update learning rate
          scheduler.step()
        
      # Evaluate model after each iteration
      model.eval()
      eval_loss = 0.0
      for batch in validation_dataloader:

          # Unpack batch into input tensors, attention masks, and labels
          input_ids = batch[0].to(device)
          attention_mask = batch[1].to(device)
          labels = batch[2].to(device)

          # Disable gradient calculations
          with torch.no_grad():
              # Perform a forward pass through the model and compute the loss
              outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
              loss = outputs.loss

              # Accumulate the evaluation loss
              eval_loss += loss.item()

      # Compute and print average validation loss
      avg_eval_loss = eval_loss / len(validation_dataloader)
      loss_list.append(avg_eval_loss)


      #print(f"Epoch {epoch+1}, Validation Loss: {avg_eval_loss}")

  # printing out the total differences between the different validation losses after each epoch
  # logic: the higher the difference, the less the model overshoots --> loss does not bounce between higher and lower values
  print((loss_list[0]-loss_list[1]) + (loss_list[1] - loss_list[2]))#+ (loss_list[2] - loss_list[3]))
  if loss_list[0] < loss_list[1] or loss_list[1] < loss_list[2]: #or loss_list[2] < loss_list[3]:
    print("Loss increase!", loss_list)
    print("###############")
  else:
    print(loss_list)
    print("###############")
    



-0.01862779112998396
Loss increase! [0.0867863979190588, 0.09840309242717922, 0.10541418904904276]
###############
-0.09272562619298695
Loss increase! [0.036578174633905294, 0.14619192304089665, 0.12930380082689225]
###############
0.00770195815712213
Loss increase! [0.04279572172090411, 0.0033883882220834493, 0.03509376356378198]
###############
-0.00641638960223645
Loss increase! [0.08614101056009531, 0.08953061442589387, 0.09255740016233176]
###############


KeyboardInterrupt: ignored

In [None]:
sum(loss_list) / len(loss_list)

0.001654502523888368

In [None]:
# Save the model weights
model_save_path = "/content/drive/MyDrive/BSc_Thesis_Max_Schaffelder/Weights/Model_1"
#model.save_pretrained(model_save_path)
print(f"New model saved at {model_save_path}")

New model saved at /content/drive/MyDrive/BSc_Thesis_Max_Schaffelder/Weights/Model_1


In [None]:
# Function to classify a string input

def classify_string(input_text):
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 
  encoded_input = tokenizer.encode_plus(
      input_text,
      add_special_tokens=True,
      max_length=512,
      padding="max_length",
      truncation=True,
      return_tensors="pt",
  )

  input_ids = encoded_input["input_ids"].to(device)
  attention_mask = encoded_input["attention_mask"].to(device)
  model.eval()

  with torch.no_grad():
      outputs = model(input_ids, attention_mask=attention_mask)
  logits = outputs.logits
  probabilities = torch.softmax(logits, dim=-1)
  predicted_class = torch.argmax(probabilities, dim=-1).item()
  confidence = torch.max(probabilities).item()
  print(f"Predicted class: {predicted_class}, Confidence: {confidence:.2f}")


In [None]:
input_text = "Collective action is a fundamental component of social movements, which involves the coordinated efforts of individuals or groups to bring about social or political change. In contemporary China, where the rapid urbanization and commercialization have transformed urban spaces into consumption-oriented zones, people have turned to alternative forms of collective action to reclaim their right to the city. This essay explores the use of cultural and social capital as an alternative repertoire of collective action to reclaim space in the commercialized urban sphere in contemporary China."
classify_string(input_text)


Predicted class: 0, Confidence: 0.76


In [None]:
%%capture
!pip install ferret-xai
from ferret import Benchmark

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 
bench = Benchmark(model, tokenizer)
explanations = bench.explain(input_text, target=0)
evaluations = bench.evaluate_explanations(explanations, target=1)

#explanations
#bench.show_evaluation_table(evaluations)
bench.show_table(explanations)

Explainer:   0%|          | 0/6 [00:00<?, ?it/s]

Explanation eval:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,Collect,ive,Ġaction,Ġis,Ġa,Ġfundamental,Ġcomponent,Ġof,Ġsocial,Ġmovements,",",Ġwhich,Ġinvolves,Ġthe,Ġcoordinated,Ġefforts,Ġof.1,Ġindividuals,Ġor,Ġgroups,Ġto,Ġbring,Ġabout,Ġsocial.1,Ġor.1,Ġpolitical,Ġchange,.,ĠIn,Ġcontemporary,ĠChina,",.1",Ġwhere,Ġthe.1,Ġrapid,Ġurban,ization,Ġand,Ġcommercial,ization.1,Ġhave,Ġtransformed,Ġurban.1,Ġspaces,Ġinto,Ġconsumption,-,oriented,Ġzones,",.2",Ġpeople,Ġhave.1,Ġturned,Ġto.1,Ġalternative,Ġforms,Ġof.2,Ġcollective,Ġaction.1,Ġto.2,Ġreclaim,Ġtheir,Ġright,Ġto.3,Ġthe.2,Ġcity,..1,ĠThis,Ġessay,Ġexplores,Ġthe.3,Ġuse,Ġof.3,Ġcultural,Ġand.1,Ġsocial.2,Ġcapital,Ġas,Ġan,Ġalternative.1,Ġrepertoire,Ġof.4,Ġcollective.1,Ġaction.2,Ġto.4,Ġreclaim.1,Ġspace,Ġin,Ġthe.4,Ġcommercial.1,ized,Ġurban.2,Ġsphere,Ġin.1,Ġcontemporary.1,ĠChina.1,..2
Partition SHAP,0.01,0.03,0.02,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.01,0.01,0.02,0.01,0.01,0.0,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.01,0.0,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.03,0.03,0.03,0.02,0.02,0.01,0.01,0.01,0.01,-0.0,0.0,-0.0,-0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.03,0.04,0.01,0.01,0.01,0.01,0.01,0.08
LIME,0.02,0.03,-0.01,-0.0,0.0,-0.0,-0.0,0.0,-0.01,-0.02,-0.01,0.0,-0.02,-0.01,-0.01,0.0,-0.0,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,-0.01,0.0,-0.0,-0.01,-0.03,0.01,0.0,-0.01,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.01,0.0,-0.0,-0.01,-0.02,-0.01,-0.01,-0.01,-0.01,-0.03,-0.01,-0.02,-0.01,-0.01,-0.01,-0.01,0.01,-0.0,0.0,-0.0,-0.01,0.04,0.13,-0.01,-0.01,-0.02,-0.01,-0.01,-0.0,-0.02,-0.0,-0.01,-0.01,-0.01,0.02,0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,0.01,0.0,-0.01,-0.0,0.01,0.02,0.02,0.01,0.04
Gradient,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.0,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.03,0.1,0.04,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.03
Gradient (x Input),0.01,-0.0,-0.01,-0.01,-0.01,-0.01,-0.01,0.01,0.01,0.01,-0.01,-0.01,0.01,-0.0,0.01,0.0,-0.0,-0.01,-0.0,0.0,-0.0,0.01,0.01,0.0,-0.0,0.0,0.0,0.0,-0.03,0.0,-0.01,-0.0,-0.01,-0.01,0.0,0.0,-0.0,0.0,0.0,-0.0,0.01,0.01,0.0,0.0,0.01,-0.02,-0.01,0.0,-0.0,-0.01,0.01,-0.01,0.01,0.01,-0.01,-0.0,0.01,0.02,0.01,0.0,0.0,0.0,-0.01,0.0,0.01,0.01,-0.04,-0.04,-0.13,-0.0,0.0,-0.01,0.01,0.01,0.0,-0.0,-0.02,0.0,-0.01,0.01,-0.01,0.01,0.01,0.0,0.0,0.01,-0.0,-0.0,-0.01,0.0,-0.0,0.02,-0.0,-0.01,0.01,-0.02,-0.05
Integrated Gradient,0.0,-0.01,0.01,0.01,0.01,0.0,-0.01,-0.01,-0.01,-0.01,-0.0,0.01,-0.02,-0.01,-0.0,-0.02,-0.01,-0.01,-0.01,-0.03,-0.0,0.0,-0.0,-0.03,0.01,-0.01,0.01,-0.0,-0.03,-0.0,-0.01,-0.01,-0.0,0.0,-0.01,0.0,-0.01,0.01,-0.01,0.0,0.01,0.01,-0.0,0.01,-0.0,0.01,0.0,0.0,0.02,-0.0,0.01,0.01,-0.0,-0.02,0.01,-0.01,-0.01,0.01,-0.0,-0.01,-0.02,-0.01,-0.01,-0.01,0.01,0.01,-0.01,-0.04,-0.06,-0.04,-0.0,-0.01,0.0,-0.0,-0.02,-0.01,-0.01,0.04,0.02,0.01,0.0,-0.01,0.0,-0.01,-0.01,0.0,-0.01,-0.01,-0.01,0.01,-0.01,0.01,-0.0,-0.01,-0.01,0.01,-0.02
Integrated Gradient (x Input),0.06,0.01,0.0,-0.01,0.0,0.01,-0.01,0.01,0.02,0.02,0.02,0.0,0.02,-0.0,0.02,0.01,0.0,0.01,0.0,0.03,0.01,0.01,0.0,0.01,-0.02,0.01,0.01,0.01,-0.0,0.02,0.01,-0.0,-0.01,-0.02,-0.0,0.01,0.0,-0.02,-0.0,0.0,-0.01,0.0,0.0,0.0,-0.0,-0.02,0.0,0.0,0.01,-0.01,0.01,-0.02,-0.01,-0.01,-0.01,-0.01,-0.01,0.02,0.01,-0.01,0.0,0.0,-0.01,-0.02,-0.01,-0.0,-0.0,-0.03,-0.04,0.01,-0.01,0.01,-0.02,0.01,0.0,-0.01,0.01,-0.01,0.02,-0.0,-0.01,0.0,0.03,-0.0,-0.02,0.0,-0.0,-0.01,-0.02,0.01,-0.0,0.02,0.0,-0.01,0.01,-0.0,-0.03


In [None]:
bench.show_evaluation_table(evaluations)

In [None]:
input_text = "This is an example sentence."

tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 

# Using encode()
token_ids = tokenizer.encode(input_text)
encoded_input = tokenizer(input_text)

tokenizer.decode([0, 2, 0, 10462, 6, 8281, 3649, 14, 190, 114, 84, 5717, 32, 11359, 30, 26739, 5588, 8, 9186, 2433, 6, 51, 32, 45, 37771, 2368, 3030, 30, 106, 4, 91, 10648, 14, 2172, 64, 202, 33, 7017, 2640, 13, 49, 5717, 114, 49, 5717, 32, 9473, 39938, 877, 8, 1403, 12, 11847, 4, 2, 0, 2, 0, 33837, 8841, 2, 0, 2, 0, 133, 1050, 17874, 6, 215, 25, 16797, 8, 35638, 6, 67, 1693, 1142, 59, 5, 29988, 9, 36471, 1809, 19, 99, 52, 216, 59, 1050, 14766, 4, 1216, 17874, 3608, 14, 84, 5717, 8, 2163, 32, 11359, 30, 10, 3143, 9, 592, 6, 4106, 6, 8, 3039, 2433, 4, 2, 0, 2, 0, 10462, 6, 8281, 10648, 14, 190, 114, 84, 5717, 32, 11359, 30, 6731, 2433, 6, 51, 32, 45, 37771, 2368, 3030, 30, 106, 4, 91, 3649, 14, 2172, 33, 5, 476, 7, 4227, 15, 49, 3266, 8, 2807, 7, 1760, 11, 10753, 19, 106, 6, 61, 2386, 106, 7, 146, 5717, 14, 32, 45, 37771, 2368, 3030, 30, 6731, 2433, 4, 2, 0, 2, 0, 48984, 2, 0, 2, 0, 1121, 6427, 6, 5, 4286, 9, 481, 40, 34, 57, 19639, 13, 11505, 6, 8, 5, 36471, 1217, 9, 481, 40, 34, 57, 6835, 30, 26948, 1809, 4, 6903, 6, 5, 7404, 9, 7017, 2640, 1302, 7, 25696, 5, 240, 13, 23732, 50, 12754, 4620, 9, 1218, 50, 45832, 4, 635, 6, 1738, 8281, 34, 3751, 7, 27389, 36471, 1809, 19, 7017, 2640, 396, 9364, 7, 23732, 50, 12754, 4620, 9, 1218, 50, 45832, 4, 2, 0, 2, 0, 530, 1728, 10648, 14, 9473, 39938, 5073, 8, 1403, 12, 11847, 2163, 32, 2139, 13, 2172, 7, 33, 7017, 2640, 13, 49, 5717, 8, 2163, 4, 91, 3649, 14, 2172, 33, 5, 476, 7, 146, 5717, 14, 32, 45, 37771, 2368, 3030, 30, 6731, 2433, 6, 61, 2386, 106, 7, 33, 7017, 2640, 13, 49, 2163, 4, 2, 0, 2, 0, 5771, 8281, 17, 27, 29, 2120, 7, 27389, 36471, 1809, 19, 7017, 2640, 7700, 1142, 59, 5, 29988, 9, 42, 1217, 19, 99, 52, 216, 59, 1050, 14766, 11, 5, 2297, 2166, 6, 12243, 6, 8, 1050, 17874, 6, 39, 7576, 3608, 14, 2172, 33, 5, 476, 7, 146, 5717, 14, 32, 45, 37771, 2368, 3030, 30, 6731, 2433, 4, 2, 0, 2, 0, 28965, 6, 5, 864, 9, 549, 10, 36471, 1217, 9, 481, 40, 7980, 7017, 2640, 64, 28, 156, 37806, 4748, 396, 9364, 7, 23732, 50, 12754, 4620, 9, 1218, 50, 45832, 1189, 10, 5674, 9, 2625, 11, 10561, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#encoded_input["attention_mask"]

