<a href="https://colab.research.google.com/github/nathan-barry/ai2-cartography-reimplementation/blob/main/data_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face packages that the import cell below pulls in (notebook shell escape)
!pip install transformers datasets

In [None]:
import numpy as np
import pandas as pd
import torch
import time
import matplotlib.pyplot as plt
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast, AdamW
from datasets import load_dataset

In [None]:
# Do same dataset preprocessing

# Fetch the SNLI splits
snli_dataset = load_dataset("snli")

# Fast ELECTRA tokenizer built from the small discriminator checkpoint;
# printed so the loaded configuration is visible in the cell output
electra_checkpoint = "google/electra-small-discriminator"
tokenizer = ElectraTokenizerFast.from_pretrained(electra_checkpoint)
print(tokenizer)

# Preprocess the dataset
def preprocess_data(batch, indices):
    """Tokenize a batch of premise/hypothesis pairs and attach labels and row indices.

    Args:
        batch: Mapping with 'premise', 'hypothesis' and 'label' entries.
        indices: Row indices of the batch, stored unchanged under 'index'.

    Returns:
        The tokenizer output, truncated and padded to 128 tokens, extended
        with a 'labels' long tensor and the 'index' entry.
    """
    features = tokenizer(
        batch['premise'],
        batch['hypothesis'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    features['labels'] = torch.tensor(batch['label'], dtype=torch.long)
    features['index'] = indices
    return features

def remove_unlabeled(example):
    """Return True for examples whose 'label' is anything other than -1."""
    label = example['label']
    return label != -1

# Drop the examples labelled -1 from the train and validation splits
filtered_train_dataset, filtered_val_dataset = (
    snli_dataset[split_name].filter(remove_unlabeled)
    for split_name in ("train", "validation")
)

# Tokenize in batches; with_indices hands each row's index to preprocess_data
train_dataset, val_dataset = (
    filtered_split.map(preprocess_data, with_indices=True, batched=True)
    for filtered_split in (filtered_train_dataset, filtered_val_dataset)
)

# Return only these columns, as PyTorch tensors, when rows are read
torch_columns = ["input_ids", "attention_mask", "labels", "index"]
train_dataset.set_format(type="torch", columns=list(torch_columns))
val_dataset.set_format(type="torch", columns=list(torch_columns))
print(train_dataset)

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write
# files under that path
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory on the mounted Drive that holds the saved arrays
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/NLP Final Project/Data"

# Load the saved confidence and correctness arrays.
# NOTE(review): presumably one row per training example and one column per
# epoch, as produced by the training notebook — confirm.
confidences = np.load(f"{DATA_DIR}/confidence_3.npy")
correctness = np.load(f"{DATA_DIR}/correctness_3.npy")

In [None]:
# Reduce each array over its last axis: the mean gives the average
# confidence / correctness, the standard deviation gives the variability
confidence_avg = np.mean(confidences, axis=-1)
variability = np.std(confidences, axis=-1)
correctness_avg = np.mean(correctness, axis=-1)

In [None]:
# Data map over every example: variability on x, mean confidence on y
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(variability, confidence_avg, alpha=0.5)
ax.set(
    xlabel="Variability",
    ylabel="Confidence",
    title="Data Map: Confidence vs Variability",
)
ax.grid(True)
plt.show()

In [None]:
# Draw a random subset (without replacement) so the scatter stays readable
num_samples = 1000
random_indices = np.random.choice(len(confidence_avg), size=num_samples, replace=False)
sampled_variability = variability[random_indices]
sampled_confidence = confidence_avg[random_indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(sampled_variability, sampled_confidence, alpha=0.5)
ax.set(
    xlabel="Variability",
    ylabel="Confidence",
    title="Data Map: Confidence vs Variability (1000 random samples)",
)
ax.grid(True)
plt.show()

In [None]:
# Score each example so that low mean confidence combined with low
# variability yields the smallest value
difficulty_score = confidence_avg + variability

# Ascending ranking: the smallest scores come first
sorted_difficulty_indices = difficulty_score.argsort()

# Size of the hardest slice, as a percentage of the training set
percentage_hardest = 5
num_hardest_examples = int(len(train_dataset) * percentage_hardest / 100)

# Cut the ranking once: the head is the hardest slice, the tail is the rest
hardest_indices, remaining_indices = np.split(
    sorted_difficulty_indices, [num_hardest_examples]
)
hardest_examples = train_dataset.select(hardest_indices)

# Map coordinates (confidence, variability) of each group
hardest_confidence, hardest_variability = (
    confidence_avg[hardest_indices],
    variability[hardest_indices],
)
remaining_confidence, remaining_variability = (
    confidence_avg[remaining_indices],
    variability[remaining_indices],
)

# Plot a random subsample of both groups. The num_samples points are split
# between the groups in the same proportion as the selection itself
# (percentage_hardest vs. the rest). The 'remaining' draw happens first.
num_samples = 1000
num_remaining_shown = int(num_samples * (100 - percentage_hardest) / 100)
num_hardest_shown = int(num_samples * percentage_hardest / 100)
random_indices_rem = np.random.choice(len(remaining_variability), num_remaining_shown, replace=False)
random_indices_har = np.random.choice(len(hardest_variability), num_hardest_shown, replace=False)

plt.scatter(remaining_variability[random_indices_rem], remaining_confidence[random_indices_rem], label='Remaining', alpha=0.5)
plt.scatter(
    hardest_variability[random_indices_har],
    hardest_confidence[random_indices_har],
    # Label follows the configured percentage instead of a hard-coded "5%"
    label=f'Hardest {percentage_hardest}%',
    alpha=0.5,
    color='red',
)

plt.xlabel('Variability')
plt.ylabel('Confidence')
# Title is derived from the variables and now closes its parenthesis
plt.title(f'Confidence vs Variability (Hardest {percentage_hardest}%, {num_samples} random samples)')
plt.legend()
plt.show()

In [None]:
import pandas as pd

# Start the table from the selected row positions and their map coordinates
hardest_df = pd.DataFrame(
    {
        'index': hardest_indices,
        'confidence': hardest_confidence,
        'variability': hardest_variability,
    }
)

# Copy the raw SNLI fields of the selected rows next to the statistics
for column in ('premise', 'hypothesis', 'label'):
    hardest_df[column] = hardest_examples[column]

# Persist to Drive without writing the DataFrame's own row index
hardest_df.to_csv('/content/drive/MyDrive/data_arrays/hardest_examples.csv', index=False)

In [None]:
# Label distribution of the hardest slice.
# Label ids: 0 = entailment, 1 = neutral, 2 = contradiction
hardest_label_counts = hardest_df['label'].value_counts()
print(hardest_label_counts)

In [None]:
# Read the saved hardest-examples table back from Drive
hardest_csv_path = '/content/drive/MyDrive/data_arrays/hardest_examples.csv'
hardest_df = pd.read_csv(hardest_csv_path)

In [None]:
# Show the first 20 rows of the hardest-examples table
hardest_df.head(n=20)

In [None]:
# Build the easiest subset

# Score each example so that high mean confidence combined with low
# variability yields the smallest value
difficulty_score = -confidence_avg + variability

# Ascending ranking: the smallest scores come first
sorted_difficulty_indices = difficulty_score.argsort()

# Size of the easiest slice, as a percentage of the training set
percentage_easiest = 45
num_easiest_examples = int(len(train_dataset) * percentage_easiest / 100)

# Cut the ranking once: the head is the easiest slice, the tail is the rest
easiest_indices, remaining_indices = np.split(
    sorted_difficulty_indices, [num_easiest_examples]
)
easiest_examples = train_dataset.select(easiest_indices)

# Map coordinates (confidence, variability) of each group
easiest_confidence, easiest_variability = (
    confidence_avg[easiest_indices],
    variability[easiest_indices],
)
remaining_confidence, remaining_variability = (
    confidence_avg[remaining_indices],
    variability[remaining_indices],
)

# Plot a random subsample of both groups. The num_samples points are split
# between the groups in the same proportion as the selection itself
# (percentage_easiest vs. the rest). The 'easiest' draw happens first.
num_samples = 1000
num_easiest_shown = int(num_samples * (percentage_easiest) / 100)
num_remaining_shown = int(num_samples * (100 - percentage_easiest) / 100)
random_indices_easiest = np.random.choice(len(easiest_variability), num_easiest_shown, replace=False)
random_indices_rem = np.random.choice(len(remaining_variability), num_remaining_shown, replace=False)

plt.scatter(
    easiest_variability[random_indices_easiest],
    easiest_confidence[random_indices_easiest],
    # Label follows the configured percentage; it previously read "5%"
    # while the selection and the title used 45%
    label=f'Easiest {percentage_easiest}%',
    alpha=0.5,
    color='red',
)
plt.scatter(remaining_variability[random_indices_rem], remaining_confidence[random_indices_rem], label='Remaining', alpha=0.5)

plt.xlabel('Variability')
plt.ylabel('Confidence')
plt.title(f'Confidence vs Variability (Easiest {percentage_easiest}%, {num_samples} random samples)')
plt.legend()
plt.show()


In [None]:
import pandas as pd

# Start the table from the selected row positions and their map coordinates
easiest_df = pd.DataFrame(
    {
        'index': easiest_indices,
        'confidence': easiest_confidence,
        'variability': easiest_variability,
    }
)

# Copy the raw SNLI fields of the selected rows next to the statistics
for column in ('premise', 'hypothesis', 'label'):
    easiest_df[column] = easiest_examples[column]

# Persist to Drive without writing the DataFrame's own row index
easiest_df.to_csv('/content/drive/MyDrive/data_arrays/easiest_examples.csv', index=False)

In [None]:
# Bare expression: as the last line of the cell, the notebook renders the easiest-examples table
easiest_df