In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the four GLUE tasks used in this notebook, in the order
# cola, sst2, mrpc, mnli (each call goes through `load_dataset("glue", <task>)`).
cola_dataset, sst2_dataset, mrpc_dataset, mnli_dataset = (
    load_dataset("glue", task_name)
    for task_name in ("cola", "sst2", "mrpc", "mnli")
)

# Quick look-sized subsets: shuffle every train split with a fixed seed (42)
# and keep the first 100 rows of the shuffled order.
# NOTE(review): these four names are rebound by the class-balanced sampling
# in the next cell, so these subsets only matter if inspected before then.
cola_sample, sst2_sample, mrpc_sample, mnli_sample = (
    task_dataset['train'].shuffle(seed=42).select(range(100))
    for task_dataset in (cola_dataset, sst2_dataset, mrpc_dataset, mnli_dataset)
)

In [2]:
# Function to sample equal number of examples from each class
def sample_equal(dataset, label_column, num_samples_per_class, random_state=42):
    """Draw the same number of rows from every class of a dataset.

    Parameters
    ----------
    dataset : anything accepted by ``pd.DataFrame(...)``
        The data to sample from; it is converted to a DataFrame first.
    label_column : str
        Name of the column whose distinct values define the classes.
    num_samples_per_class : int
        Number of rows to draw for each distinct label value.
    random_state : int or None, default 42
        Seed for the sampling RNG, so that re-running the notebook yields
        the same sample. Pass ``None`` for a fresh, non-reproducible draw.

    Returns
    -------
    pandas.DataFrame
        ``num_samples_per_class`` rows per label, grouped by label in sorted
        label order, with a fresh 0..n-1 index and all original columns
        (including ``label_column``).

    Notes
    -----
    Rows are drawn *without* replacement whenever a class has at least
    ``num_samples_per_class`` rows, so the sample contains no duplicated
    rows unless the class is too small; only then does sampling fall back
    to drawing with replacement.
    """
    df = pd.DataFrame(dataset)

    # One RNG shared by all classes: reproducible for a given seed, but the
    # classes do not all receive the exact same stream of draws.
    rng = np.random.RandomState(random_state)

    # Iterate over the groups instead of using groupby().apply(): newer pandas
    # versions deprecate/stop passing the grouping column into apply(), which
    # combined with reset_index(drop=True) would lose the label column.
    per_class = []
    for _, class_rows in df.groupby(label_column):
        # Oversample (with duplicates) only when the class is too small.
        needs_replacement = len(class_rows) < num_samples_per_class
        per_class.append(
            class_rows.sample(
                n=num_samples_per_class,
                replace=needs_replacement,
                random_state=rng,
            )
        )

    if not per_class:
        # No groups (e.g. empty input): return an empty frame with the same
        # columns rather than letting pd.concat([]) raise.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(per_class, ignore_index=True)

# Rows to draw per label value: 450 for the binary-classification tasks
# (CoLA, SST-2, MRPC) and 300 for MNLI, which is a multi-class problem.
samples_per_class = 450
samples_per_class_mnli = 300

# Build one class-balanced DataFrame per task from its train split
cola_sample = sample_equal(cola_dataset['train'], 'label', samples_per_class)
sst2_sample = sample_equal(sst2_dataset['train'], 'label', samples_per_class)
mrpc_sample = sample_equal(mrpc_dataset['train'], 'label', samples_per_class)
mnli_sample = sample_equal(mnli_dataset['train'], 'label', samples_per_class_mnli)

# Write every balanced sample to its own CSV file (index column omitted)
for csv_name, balanced_frame in (
    ('cola_sample.csv', cola_sample),
    ('sst2_sample.csv', sst2_sample),
    ('mrpc_sample.csv', mrpc_sample),
    ('mnli_sample.csv', mnli_sample),
):
    balanced_frame.to_csv(csv_name, index=False)

print("Samples saved successfully.")

Samples saved successfully.
