In [1]:
import torch
import datasets
import transformers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import Dict
from torch.utils.data import Dataset
from captum.attr import visualization as viz

from captum.attr import (IntegratedGradients, LayerIntegratedGradients,
                         configure_interpretable_embedding_layer,
                         remove_interpretable_embedding_layer)
from transformers import (ElectraForSequenceClassification,
                          ElectraTokenizerFast, EvalPrediction, InputFeatures,
                          Trainer, TrainingArguments, glue_compute_metrics)

In [2]:
# Model and Tokenizer
# Both objects are loaded from the same pretrained checkpoint id, so it is
# bound once and reused.
ELECTRA_CHECKPOINT = "google/electra-small-discriminator"

# Sequence-classification model configured with two output labels.
model = ElectraForSequenceClassification.from_pretrained(
    ELECTRA_CHECKPOINT,
    num_labels=2,
)

# Fast tokenizer for the same checkpoint, with lower-casing requested.
tokenizer = ElectraTokenizerFast.from_pretrained(
    ELECTRA_CHECKPOINT,
    do_lower_case=True,
)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [3]:
# Fetch the GLUE SST-2 splits through the `datasets` library
dataset = datasets.load_dataset("glue", "sst2")

# Report the distinct label values present in each split
for template, split in (("Training set labels: {}", "train"),
                        ("Validation set labels: {}", "validation"),
                        ("Test set labels: {}", "test")):
    print(template.format(set(dataset[split]["label"])))

# Adjust the column-width display option before previewing the text column
pd.set_option("display.max_colwidth", 0)

# Preview the training split as a DataFrame.
# NOTE(review): the "senence" key looks like a typo for "sentence"; it is kept
# unchanged here in case other cells read this column by name - confirm and rename.
df = pd.DataFrame({"senence": dataset["train"]["sentence"],
                   "label": dataset["train"]["label"]})
df.head()

Reusing dataset glue (/home/mirac13/.cache/huggingface/datasets/glue/sst2/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


Training set labels: {0, 1}
Validation set labels: {0, 1}
Test set labels: {-1}


Unnamed: 0,senence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates something rather beautiful about human nature,1
3,remains utterly satisfied to remain the same throughout,0
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0


In [4]:
# Create Dataset class

class TrainerDataset(Dataset):
    """Dataset that tokenizes all texts up front and serves ``InputFeatures``.

    The whole collection is tokenized once in ``__init__`` with ``padding=True``
    so every encoded example has the same length; ``__getitem__`` then only
    slices the pre-computed encodings.

    Args:
        inputs: Sized collection of raw text strings.
        targets: Sized, indexable collection of labels, one per entry in
            ``inputs``.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_texts, padding=True, truncation=True)``. Its
            result must support ``[...]`` lookup for ``'input_ids'`` and
            ``'attention_mask'`` and ``.get('token_type_ids')``.

    Raises:
        TypeError: If ``inputs`` is a single string rather than a collection.
        ValueError: If ``inputs`` and ``targets`` differ in length.
    """

    def __init__(self, inputs, targets, tokenizer):
        # A bare string would be split into characters by list() below.
        if isinstance(inputs, str):
            raise TypeError(
                "inputs must be a collection of strings, not a single string")
        # Fail early instead of hitting an IndexError (or silently ignoring
        # surplus labels) at item-access time.
        if len(inputs) != len(targets):
            raise ValueError(
                "inputs and targets must have the same length "
                "(got {} and {})".format(len(inputs), len(targets)))

        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer

        # list(...) lets tuples and other sequence-like containers through;
        # truncation=True asks the tokenizer to cap over-length sequences.
        self.tokenized_inputs = tokenizer(
            list(inputs), padding=True, truncation=True)

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``InputFeatures`` for the example at position ``idx``."""
        # Some tokenizers do not produce segment ids; pass None in that case.
        token_type_ids = self.tokenized_inputs.get('token_type_ids')
        return InputFeatures(
            input_ids=self.tokenized_inputs['input_ids'][idx],
            token_type_ids=(None if token_type_ids is None
                            else token_type_ids[idx]),
            attention_mask=self.tokenized_inputs['attention_mask'][idx],
            label=self.targets[idx])

In [5]:
# Wrap the "train" and "validation" splits (in that order) in TrainerDataset
# objects that share the same tokenizer.
train_dataset, eval_dataset = (
    TrainerDataset(dataset[split]["sentence"],
                   dataset[split]["label"],
                   tokenizer)
    for split in ("train", "validation")
)