# Importing the Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Locating to Drive Location

In [4]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


# Present Working Directory

In [5]:
!pwd

/content/drive/MyDrive


# Installing Hugging face datasets and evaluate Library

In [7]:
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

# Importing all the relevant packages

In [8]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import (
    Data2VecTextModel,
    Data2VecVisionModel,
    Data2VecAudioModel,
    Data2VecTextForSequenceClassification,
    Data2VecVisionForImageClassification,
    Data2VecAudioForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoFeatureExtractor,
    AutoTokenizer,
    AutoProcessor,
    EvalPrediction
)
import evaluate
from torch import nn
import math
from typing import Dict, List, Optional, Union

# Method to Load pretrained data2vec model based on modality: Text, Vision and Speech from Hugging Face Repository

In [9]:
def load_pretrained_model(modality):
    if modality == "text":
        model_name = "facebook/data2vec-text-base"
        model = Data2VecTextModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer, None

    elif modality == "vision":
        model_name = "facebook/data2vec-vision-base"
        model = Data2VecVisionModel.from_pretrained(model_name)
        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        return model, None, feature_extractor

    elif modality == "speech":
        model_name = "facebook/data2vec-audio-base"
        model = Data2VecAudioModel.from_pretrained(model_name)
        processor = AutoProcessor.from_pretrained(model_name)
        return model, None, processor
    else:
        print("Invalid Modality")

# Method to load two dataset mentioned in the data2vec paper based on modality: Text, Vision and Speech. For Text Modality, SST-2 and MNLI from GLUE benchmark is Considered.

In [38]:
def load_datasets_from_paper(modality):
    if modality == "text":
        dataset1 = load_dataset("glue", "sst2")
        dataset2 = load_dataset("glue", "mnli")
        return dataset1, dataset2
    elif modality == "vision":
        dataset1 = load_dataset("cifar100")
        dataset2 = load_dataset("fengyang0317/imagenet-1k", split="val[:1000]")
        return dataset1, dataset2
    elif modality == "speech":
        dataset1 = load_dataset("librispeech_asr", "clean", split="test")
        dataset2 = {}
        return dataset1, dataset2
    else:
        print("Inavlid Modality")

# Method for Reproducing data2vec results on datasets from the paper for Text Modality. The Evaluate method use the pretrained model on a datasets from the paper.

In [31]:
def evaluate_on_dataset_for_text_modality(model, dataset, tokenizer=None, processor=None):
    if isinstance(dataset, dict) and 'dataset_name' in dataset:
        dataset_name = dataset['dataset_name']
        actual_dataset = dataset['data'] if 'data' in dataset else dataset
    else:
        dataset_info = str(dataset)
        actual_dataset = dataset
        if "sst2" in dataset_info.lower():
            dataset_name = "sst2"
        elif "mnli" in dataset_info.lower():
            dataset_name = "mnli"
    num_labels = 2 if "sst2" in dataset_name else 3

    classifier = Data2VecTextForSequenceClassification.from_pretrained(
        "facebook/data2vec-text-base",
        num_labels=num_labels
    )

    def preprocess_function(examples):
        return tokenizer(
            examples["sentence"] if "sentence" in examples else examples["premise"],
            examples["sentence2"] if "sentence2" in examples else None,
            truncation=True,
            padding="max_length",
            max_length=128
        )

    if "mnli" in dataset_name:
        eval_dataset = actual_dataset["validation_matched"]
    else:
        eval_dataset = actual_dataset["validation"]

    tokenized_dataset = eval_dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True,
    )

    metric = evaluate.load("glue", dataset_name)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=classifier,
        args=training_args,
        eval_dataset=tokenized_dataset,
        compute_metrics=compute_metrics,
    )

    results = trainer.evaluate()

    print("Evaluation Accuracy {} on Dataset {}".format(dataset['dataset_name'],results['eval_accuracy']))

# Loading Model and SST2 and MNLI Dataset to reproduce results for text Modality

In [16]:
model, tokenizer, processor = load_pretrained_model("text")
dataset1, dataset2 = load_datasets_from_paper("text")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/data2vec-text-base were not used when initializing Data2VecTextModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing Data2VecTextModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecTextModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Data2VecTextModel were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

test_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

# Reprodcing Result for Text Modality on SST2 GLUE TASK

In [30]:
dataset1 = dict(dataset1)
dataset1['dataset_name'] = 'sst2'
evaluate_on_dataset_for_text_modality(model, dataset1, tokenizer, processor)

Some weights of Data2VecTextForSequenceClassification were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Accuracy sst2 on Dataset 0.8491743119266054


# Reprodcing Result for Text Modality on MNLI GLUE TASK

In [32]:
dataset2 = dict(dataset2)
dataset2['dataset_name'] = 'mnli'
evaluate_on_dataset_for_text_modality(model, dataset2, tokenizer, processor)

Some weights of Data2VecTextForSequenceClassification were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Accuracy mnli on Dataset 0.8291743119266054


# Method for Reproducing data2vec results on datasets from the paper for Vision Modality. The Evaluate method use the pretrained model on a datasets from the paper.

In [57]:
def evaluate_on_dataset_for_vision_modality(model, dataset, tokenizer=None, processor=None):
    num_labels = 1000

    classifier = Data2VecVisionForImageClassification.from_pretrained(
        "facebook/data2vec-vision-base",
        num_labels=num_labels,
        ignore_mismatched_sizes=True
    )

    def preprocess_function(examples):
        image_key = 'img' if 'img' in examples else 'image'
        return processor(images=examples[image_key], return_tensors="pt")

    eval_dataset = dataset["test"] if "test" in dataset else dataset
    processed_dataset = eval_dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True,
    )

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return {"accuracy": accuracy_score(labels, predictions)}

    trainer = Trainer(
        model=classifier,
        args=training_args,
        eval_dataset=processed_dataset,
        compute_metrics=compute_metrics,
    )

    results = trainer.evaluate()
    print(f"Evaluation Accuracy {results['eval_accuracy']}")

# Loading CIFAR and ImageNet and MNLI Dataset to reproduce results for text Modality

In [44]:
model, tokenizer, processor = load_pretrained_model("vision")
dataset1, dataset2 = load_datasets_from_paper("vision")

  return func(*args, **kwargs)


In [45]:
type(dataset2)

In [58]:
evaluate_on_dataset_for_vision_modality(model, dataset2, tokenizer, processor)

Some weights of Data2VecVisionForImageClassification were not initialized from the model checkpoint at facebook/data2vec-vision-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Accuracy 0.8125689
