# Prerequisite

## Install requirements

In [1]:
!pip install accelerate transformers datasets[vision] evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets[vision]
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggin

## Import requirements

In [None]:
from tqdm import tqdm
import gc
import os
import shutil
import glob
import random
import numpy as np
import torch
from torch import nn
import pandas as pd
from PIL import Image as PIL_Image
from PIL import ImageFile

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import evaluate


from datasets import Dataset, Features, Value, Image, ClassLabel, concatenate_datasets
from transformers import AltCLIPProcessor, AltCLIPModel
from transformers.configuration_utils import PretrainedConfig
from transformers.models.altclip.modeling_altclip import AltCLIPEncoderLayer
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput, ModelOutput
from transformers import TrainingArguments, Trainer

## Set some parameters to have determinism

In [None]:
# Allow PIL to load image files whose data is truncated instead of failing on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Raise PIL's pixel-count limit so that very large images can still be opened.
PIL_Image.MAX_IMAGE_PIXELS = 933120000

# Single seed shared by every random number generator used in this notebook.
GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
# Ask PyTorch to use deterministic implementations only.
torch.use_deterministic_algorithms(True)
# Workspace setting for cuBLAS that accompanies deterministic mode on CUDA.
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


# Prepare data

## Download

In [None]:
# Download the train/trial archive, the test images and the test annotations.
# `-c` resumes a partially downloaded file instead of starting over.
# NOTE(review): the recorded output of the unzip cell further down reports that
# train_trial.zip and test_images.zip are not valid zip archives, so these two
# requests may not have returned the archives themselves — verify the downloaded
# files before continuing.
!wget -O train_trial.zip -c 'https://docs.google.com/uc?export=download&id=1byX4wpe1UjyCVyYrT04sW17NnycKAK7N&confirm=t'
!wget -O test_images.zip -c 'https://docs.google.com/uc?export=download&id=1rK7EskkEXzD59j5On-8orO5mIinQGUMW&confirm=t'
!wget -O test_data.zip -c 'https://docs.google.com/uc?export=download&id=10vDZsY0EhzvFFR8IF-3P_2ApOF0GIMML&confirm=t'

--2023-05-27 21:23:47--  https://docs.google.com/uc?export=download&id=1byX4wpe1UjyCVyYrT04sW17NnycKAK7N&confirm=t
Resolving docs.google.com (docs.google.com)... 74.125.24.113, 74.125.24.139, 74.125.24.138, ...
Connecting to docs.google.com (docs.google.com)|74.125.24.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-48-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/54qk1qsv81kiombnasuqgo3na4vft2j5/1685222625000/04589392675467887255/*/1byX4wpe1UjyCVyYrT04sW17NnycKAK7N?e=download&uuid=e3610b2f-27cd-4799-8b35-44ce4786321b [following]
--2023-05-27 21:23:47--  https://doc-08-48-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/54qk1qsv81kiombnasuqgo3na4vft2j5/1685222625000/04589392675467887255/*/1byX4wpe1UjyCVyYrT04sW17NnycKAK7N?e=download&uuid=e3610b2f-27cd-4799-8b35-44ce4786321b
Resolving doc-08-48-docs.googleusercontent.com (doc-08-48-docs.googleusercontent.com)... 172.217.194.132, 240

## Unzip and unify file names for convenience (Ignore disk warning)


In [None]:
# Extract the three archives (the `rm` lines are left commented out so the
# archives stay on disk).
!unzip train_trial.zip
# !rm train_trial.zip
!unzip test_images.zip
# !rm test_images.zip
!unzip test_data.zip -d test_data
# !rm test_data.zip
# Move the test annotations into a `test_v1` folder next to the other splits and
# rename them so every file follows the same `<part>.{data,gold}.v1.txt` pattern.
!mkdir semeval-2023-task-1-V-WSD-train-v1/test_v1
!mv test_data/en.test.data.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/en.test.data.v1.txt
!mv test_data/en.test.gold.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/en.test.gold.v1.txt
!mv test_data/it.test.data.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/it.test.data.v1.txt
!mv test_data/it.test.gold.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/it.test.gold.v1.txt
!mv test_data/fa.test.data.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/fa.test.data.v1.txt
!mv test_data/fa.test.gold.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/fa.test.gold.v1.txt
!rm -r test_data
# Place the test images under `test_v1/test_images_v1`, then drop "train" from the
# root folder name since it now also holds the test split.
!mv test_images semeval-2023-task-1-V-WSD-train-v1/test_v1/test_images_v1
!mv semeval-2023-task-1-V-WSD-train-v1 semeval-2023-task-1-V-WSD-v1

Archive:  train_trial.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of train_trial.zip or
        train_trial.zip.zip, and cannot find train_trial.zip.ZIP, period.
Archive:  test_images.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of test_images.zip or
        test_images.zip.zip, and cannot find test_images.zip.ZIP, period.
Archive:  test_data.zip
  inflating: test_data/en.test.data.v1.1.txt  
  inflating: test_data/en.test.gold.v1.1.txt  
  inflating: test_data/fa.test.data.tx

## Load datasets to pandas dataframe

In [None]:
def load_dataset_data_frame(parts):
    """Load the data/gold files of each requested part into one dataframe per part.

    Args:
        parts: iterable of part names such as 'train', 'trial' or 'en.test'. Any
            part whose name contains "test" is read from the shared `test_v1`
            folder; every other part is read from `<part>_v1`.

    Returns:
        A list of dataframes, in the same order as `parts`. Each dataframe has the
        columns `word`, `phrase`, `image_{i}_name` and `image_{i}` (i = 0..9),
        `gold_name` and `labels`, where `image_{i}` is the path of the i-th
        candidate image and `labels` is the position of the gold image among the
        ten candidates.

    Raises:
        ValueError: if a row's gold image is not one of its ten candidates.
    """
    dataset_root = 'semeval-2023-task-1-V-WSD-v1'
    num_candidates = 10

    def gold_index(row):
        # Position of the first candidate whose file name equals the gold name.
        candidate_names = [row[f'image_{i}_name'] for i in range(num_candidates)]
        return candidate_names.index(row['gold_name'])

    data_frames = []
    for part in parts:
        print(f"Loading {part} data...")
        # All test languages share one folder and one image directory.
        split = part if "test" not in part else "test"
        split_directory = f'{dataset_root}/{split}_v1'

        df1 = pd.read_csv(f'{split_directory}/{part}.data.v1.txt', sep='\t', header=None)
        df2 = pd.read_csv(f'{split_directory}/{part}.gold.v1.txt', sep='\t', header=None)

        # Columns 0 and 1 hold the word and the phrase; columns 2..11 hold the
        # file names of the ten candidate images.
        data_columns = {0: 'word', 1: 'phrase'}
        data_columns.update({i + 2: f'image_{i}_name' for i in range(num_candidates)})
        df1 = df1.rename(columns=data_columns)
        df2 = df2.rename(columns={0: 'gold_name'})

        # Turn every image file name into a path relative to the notebook.
        # NOTE(review): assumes the images of a split live in
        # `<split>_v1/<split>_images_v1/`, as for the test images moved there by
        # the unzip cell — confirm for the train and trial archives.
        images_directory = f'{split_directory}/{split}_images_v1'
        for i in range(num_candidates):
            df1[f'image_{i}'] = f'{images_directory}/' + df1[f'image_{i}_name']

        df = pd.merge(df1, df2, left_index=True, right_index=True)
        df['labels'] = df.apply(gold_index, axis=1)
        data_frames.append(df)
        print(f"Done; {part} data loaded ({len(df)} rows)!")
    return data_frames

train_df, trial_df, en_test_df, it_test_df, fa_test_df = load_dataset_data_frame(['train', 'trial', 'en.test', 'it.test', 'fa.test'])

Loading train data...


In [None]:
trial_df

Unnamed: 0,word,phrase,image_0_name,image_1_name,image_2_name,image_3_name,image_4_name,image_5_name,image_6_name,image_7_name,...,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9,gold_name,labels
0,andromeda,andromeda tree,image.155.jpg,image.68.jpg,image.9.jpg,image.72.jpg,image.158.jpg,image.86.jpg,image.7.jpg,image.132.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.86.jpg,5
1,angora,angora city,image.5.jpg,image.52.jpg,image.96.jpg,image.70.jpg,image.46.jpg,image.91.jpg,image.76.jpg,image.139.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.70.jpg,3
2,anteater,marsupial anteater,image.147.jpg,image.16.jpg,image.107.jpg,image.135.jpg,image.93.jpg,image.59.jpg,image.88.png,image.131.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.107.jpg,2
3,bank,bank erosion,image.104.jpg,image.64.jpg,image.108.jpg,image.80.jpg,image.21.jpg,image.99.jpg,image.117.jpg,image.146.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.64.jpg,1
4,router,internet router,image.127.jpg,image.0.jpg,image.20.jpg,image.18.jpg,image.112.jpg,image.97.jpg,image.24.jpg,image.1.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.18.jpg,3
5,stick,centre stick,image.100.jpg,image.62.jpg,image.156.jpg,image.78.jpg,image.122.jpg,image.81.jpg,image.148.jpg,image.114.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.156.jpg,2
6,swing,swing hit,image.51.jpg,image.141.jpg,image.11.jpg,image.77.jpg,image.95.jpg,image.33.jpg,image.65.jpg,image.113.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.54.jpg,9
7,tube,london tube,image.105.jpg,image.129.jpg,image.41.jpg,image.43.jpg,image.102.jpg,image.28.jpg,image.79.jpg,image.138.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.28.jpg,5
8,venus,venus surface,image.60.jpg,image.37.jpg,image.83.jpg,image.94.jpg,image.17.jpg,image.29.jpg,image.32.jpg,image.137.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.124.jpg,9
9,wheel,breaking wheel,image.111.jpg,image.69.jpg,image.82.jpg,image.73.jpg,image.74.jpg,image.48.jpg,image.140.jpg,image.118.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.118.jpg,7


## Create HuggingFace dataset from pandas dataframe

In [None]:

# Explicit schema for the dataframes built above: text columns are strings, the
# `image_{i}` path columns are declared as images so that they are decoded when a
# row is read, and `labels` is the index (0-9) of the gold image.
features = Features({
    'word': Value('string'),
    'phrase': Value('string'),
    **{f'image_{i}_name': Value('string') for i in range(10)},
    **{f'image_{i}': Image() for i in range(10)},
    'gold_name': Value('string'),
    'labels': ClassLabel(num_classes=10),
})

# Only a small, seeded random subset of the train and test splits is kept.
train_dataset = Dataset.from_pandas(train_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(496))
trial_dataset = Dataset.from_pandas(trial_df, features=features).shuffle(seed=GLOBAL_SEED) # Whole trial dataset
en_test_dataset = Dataset.from_pandas(en_test_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(64))
it_test_dataset = Dataset.from_pandas(it_test_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(64))
fa_test_dataset = Dataset.from_pandas(fa_test_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(64))

In [None]:
trial_dataset

Dataset({
    features: ['word', 'phrase', 'image_0_name', 'image_1_name', 'image_2_name', 'image_3_name', 'image_4_name', 'image_5_name', 'image_6_name', 'image_7_name', 'image_8_name', 'image_9_name', 'image_0', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'image_8', 'image_9', 'gold_name', 'labels'],
    num_rows: 16
})

## Preprocess data using AltCLIP processor

In [None]:
def get_woaw(phrase, word) -> str:
    """Return `phrase` with every occurrence of `word` omitted.

    Args:
        phrase: the full phrase, e.g. "andromeda tree".
        word: the word to drop from the phrase, e.g. "andromeda".

    Returns:
        The remaining text with surrounding and repeated whitespace collapsed,
        e.g. "tree". The result is an empty string when nothing is left.
    """
    # str.replace with an empty pattern would insert text between every
    # character, so an empty word only triggers whitespace normalisation.
    remainder = phrase.replace(word, ' ') if word else phrase
    woaw = ' '.join(remainder.split())
    return woaw

In [None]:
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")

def process_function(examples):
    """Tokenize the phrase and its WOAW text and preprocess the ten candidate images.

    Called by `Dataset.map` without batching, so `examples` is a single row.

    Args:
        examples: one dataset row with the keys `phrase`, `word` and
            `image_0` ... `image_9`.

    Returns:
        The processor output with `pixel_values` for the ten images and the text
        encodings split into `phrase_input_ids` / `phrase_attention_mask` and
        `woaw_input_ids` / `woaw_attention_mask`.
    """
    phrase = examples['phrase']
    # Two texts are encoded together: row 0 is the full phrase, row 1 is the
    # phrase with the target word omitted.
    processor_output = processor(
        text=[phrase, get_woaw(phrase, examples['word'])],
        images=[examples[f'image_{i}'] for i in range(10)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=64,
    )

    # Split the two text rows into separately named columns (each keeps a
    # leading dimension of size 1) and drop the combined ones.
    processor_output['phrase_input_ids'] = processor_output['input_ids'][:1,:]
    processor_output['phrase_attention_mask'] = processor_output['attention_mask'][:1,:]
    processor_output['woaw_input_ids'] = processor_output['input_ids'][1:,:]
    processor_output['woaw_attention_mask'] = processor_output['attention_mask'][1:,:]
    processor_output.pop('input_ids')
    processor_output.pop('attention_mask')
    return processor_output

# Preprocess every split row by row. The results are written to named cache files
# so that they can be reloaded with `Dataset.from_file` after the runtime restart.
processed_train_dataset = train_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_train_dataset_cache')
processed_trial_dataset = trial_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_trial_dataset_cache')
processed_en_test_dataset = en_test_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_en_test_dataset_cache')
processed_it_test_dataset = it_test_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_it_test_dataset_cache')
processed_fa_test_dataset = fa_test_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_fa_test_dataset_cache')

Downloading (…)rocessor_config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Map:   0%|          | 0/496 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]



In [None]:
processed_trial_dataset

Dataset({
    features: ['word', 'phrase', 'image_0_name', 'image_1_name', 'image_2_name', 'image_3_name', 'image_4_name', 'image_5_name', 'image_6_name', 'image_7_name', 'image_8_name', 'image_9_name', 'image_0', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'image_8', 'image_9', 'gold_name', 'labels', 'pixel_values', 'phrase_input_ids', 'phrase_attention_mask', 'woaw_input_ids', 'woaw_attention_mask'],
    num_rows: 16
})

## Extract and save features using AltCLIP

In [None]:
# Splits used for feature extraction. The training split is the processed train
# subset plus the whole trial set, reshuffled with the global seed.
datasets_dict = {
    'train': concatenate_datasets([processed_train_dataset, processed_trial_dataset]).shuffle(seed=GLOBAL_SEED),
    'en_test': processed_en_test_dataset,
    'it_test': processed_it_test_dataset,
    'fa_test': processed_fa_test_dataset
}

In [None]:
def get_batch(dataset, batch_size, i):
    """Return the rows `i` .. `i + batch_size - 1` of `dataset` as a dataset.

    Args:
        dataset: object that supports `len()` and `select(indices)`.
        batch_size: maximum number of rows in the batch.
        i: index of the first row of the batch (the caller steps `i` in
            increments of `batch_size`).

    Returns:
        The result of `dataset.select(...)` for the requested rows; the last
        batch is shorter when the dataset size is not a multiple of `batch_size`.
    """
    # Clamp the end so the final, possibly partial, batch stays in range.
    batch_end = min(i + batch_size, len(dataset))
    batch = dataset.select(range(i, batch_end))
    return batch

def remove_second_dim(tensor):
    """Drop the second dimension (index 1) of `tensor` when it has size 1.

    Args:
        tensor: a torch tensor with at least two dimensions, e.g. of shape
            (batch, 1, length).

    Returns:
        The tensor without its second dimension, e.g. of shape (batch, length).
        A second dimension whose size is not 1 is left unchanged.
    """
    tensor = torch.squeeze(tensor, dim=1)
    return tensor

In [None]:
def extract_features(datasets_dict, batch_size=1, device='cuda:0'):
    """Run AltCLIP over every dataset and save the resulting features to disk.

    For each dataset a directory `alt_<name>_features` is (re)created and, for
    every batch starting at row `i`, three files are written into it:
    `phrase_features_<i>.pt`, `woaw_features_<i>.pt` and `image_features_<i>.pt`.

    Args:
        datasets_dict: mapping from dataset name to a processed dataset holding
            the tokenized texts, `pixel_values` and `labels`.
        batch_size: number of rows processed per forward pass.
        device: device the model and the inputs are moved to.
    """
    model = AltCLIPModel.from_pretrained("BAAI/AltCLIP").to(device)
    model.eval()
    for processed_dataset_name, processed_dataset in datasets_dict.items():
        # Always start from an empty feature directory.
        features_directory = f'alt_{processed_dataset_name}_features'
        if os.path.exists(features_directory):
            shutil.rmtree(features_directory, ignore_errors=True)
        os.mkdir(features_directory)
        processed_dataset.set_format(type='torch', columns=['phrase_input_ids', 'phrase_attention_mask', 'woaw_input_ids', 'woaw_attention_mask', 'pixel_values', 'labels'])
        for i in tqdm(range(0, processed_dataset.num_rows, batch_size)):
            batch = get_batch(processed_dataset, batch_size, i)[:]
            # Text encodings are stored with an extra dimension of size 1.
            phrase_input_ids = remove_second_dim(batch['phrase_input_ids'])
            phrase_attention_mask = remove_second_dim(batch['phrase_attention_mask'])
            woaw_input_ids = remove_second_dim(batch['woaw_input_ids'])
            woaw_attention_mask = remove_second_dim(batch['woaw_attention_mask'])
            # The real row count is read from the batch (the last batch may be
            # shorter) and kept in its own name so `batch_size` is not overwritten.
            rows_in_batch, image_count, image_dim0, image_dim1, image_dim2 = batch['pixel_values'].shape
            # Fold the candidate images into the batch dimension for the image encoder.
            pixel_values = torch.reshape(
                batch['pixel_values'],
                (rows_in_batch * image_count, image_dim0, image_dim1, image_dim2),
            )

            phrase_input_ids = phrase_input_ids.to(device)
            phrase_attention_mask = phrase_attention_mask.to(device)
            woaw_input_ids = woaw_input_ids.to(device)
            woaw_attention_mask = woaw_attention_mask.to(device)
            pixel_values = pixel_values.to(device)

            # Features are only stored, never back-propagated through, so no
            # autograd graph is needed.
            with torch.no_grad():
                phrase_features = model.get_text_features(input_ids=phrase_input_ids, attention_mask=phrase_attention_mask)
                woaw_features = model.get_text_features(input_ids=woaw_input_ids, attention_mask=woaw_attention_mask)
                image_features = model.get_image_features(pixel_values=pixel_values)

            torch.save(torch.unsqueeze(phrase_features, dim=1), f'{features_directory}/phrase_features_{i}.pt')
            torch.save(torch.unsqueeze(woaw_features, dim=1), f'{features_directory}/woaw_features_{i}.pt')
            # Unfold the images again: one row per example, one entry per candidate.
            torch.save(torch.reshape(image_features, (rows_in_batch, image_count, -1)), f'{features_directory}/image_features_{i}.pt')
    
extract_features(datasets_dict)

Downloading (…)lve/main/config.json:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

`text_config_dict` is provided which will be used to initialize `AltCLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


Downloading pytorch_model.bin:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

100%|██████████| 512/512 [07:10<00:00,  1.19it/s]
100%|██████████| 64/64 [00:54<00:00,  1.17it/s]
100%|██████████| 64/64 [00:54<00:00,  1.17it/s]
100%|██████████| 64/64 [00:54<00:00,  1.18it/s]


## Add extracted features to HuggingFace datasets

**Restart Your Runtime Here to Avoid Out of Memory Issue!**

In [None]:
os.kill(os.getpid(), 9)

### Re-import requirements after the restart

In [None]:
from tqdm import tqdm
import gc
import os
import shutil
import glob
import random
import numpy as np
import torch
from torch import nn
import pandas as pd
from PIL import Image as PIL_Image
from PIL import ImageFile

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import evaluate


from datasets import Dataset, Features, Value, Image, ClassLabel, concatenate_datasets
from transformers import AltCLIPProcessor, AltCLIPModel
from transformers.configuration_utils import PretrainedConfig
from transformers.models.altclip.modeling_altclip import AltCLIPEncoderLayer
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput, ModelOutput
from transformers import TrainingArguments, Trainer

### Set the determinism parameters again after the restart

In [None]:
# Same settings as before the restart; they must be applied again because the
# restart wipes the kernel state.
# Allow PIL to load image files whose data is truncated instead of failing on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Raise PIL's pixel-count limit so that very large images can still be opened.
PIL_Image.MAX_IMAGE_PIXELS = 933120000

# Single seed shared by every random number generator used in this notebook.
GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
# Ask PyTorch to use deterministic implementations only.
torch.use_deterministic_algorithms(True)
# Workspace setting for cuBLAS that accompanies deterministic mode on CUDA.
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


### Load datasets from cache files

In [None]:
# Reload the processed datasets from the cache files written by the `map` calls
# before the restart, instead of preprocessing everything again.
processed_train_dataset = Dataset.from_file('alt_train_dataset_cache')
processed_trial_dataset = Dataset.from_file('alt_trial_dataset_cache')
processed_en_test_dataset = Dataset.from_file('alt_en_test_dataset_cache')
processed_it_test_dataset = Dataset.from_file('alt_it_test_dataset_cache')
processed_fa_test_dataset = Dataset.from_file('alt_fa_test_dataset_cache')

### Add features to datasets

In [None]:
# Rebuilt exactly as before the restart (same concatenation and same shuffle
# seed) so that row `i` matches the feature files saved for row `i`.
datasets_dict = {
    'train': concatenate_datasets([processed_train_dataset, processed_trial_dataset]).shuffle(seed=GLOBAL_SEED),
    'en_test': processed_en_test_dataset,
    'it_test': processed_it_test_dataset,
    'fa_test': processed_fa_test_dataset
}



In [None]:
def add_features_to_datasets(datasets_dict, batch_size=1):
    """Replace the raw model inputs of every dataset with the saved AltCLIP features.

    The feature files written to `alt_<name>_features` are loaded, concatenated
    and attached as the columns `image_features`, `phrase_features` and
    `woaw_features`. The tokenized text and `pixel_values` columns are dropped.
    `datasets_dict` is updated in place and the feature directory is deleted once
    its content has been attached.

    Args:
        datasets_dict: mapping from dataset name to processed dataset.
        batch_size: batch size that was used when the features were extracted;
            it determines the row offsets in the feature file names.
    """
    # Iterate over a snapshot because the dictionary values are reassigned below.
    for processed_dataset_name, processed_dataset in list(datasets_dict.items()):
        features_directory = f'alt_{processed_dataset_name}_features'
        phrase_features = []
        woaw_features = []
        image_features = []
        for i in tqdm(range(0, processed_dataset.num_rows, batch_size)):
            # The tensors may have been saved from a GPU; load them onto the CPU.
            phrase_features.append(torch.load(f'{features_directory}/phrase_features_{i}.pt', map_location='cpu'))
            woaw_features.append(torch.load(f'{features_directory}/woaw_features_{i}.pt', map_location='cpu'))
            image_features.append(torch.load(f'{features_directory}/image_features_{i}.pt', map_location='cpu'))
        phrase_features = torch.cat(phrase_features, dim=0).detach()
        woaw_features = torch.cat(woaw_features, dim=0).detach()
        image_features = torch.cat(image_features, dim=0).detach()

        dataset = processed_dataset.remove_columns([
            'phrase_input_ids', 'phrase_attention_mask', 'woaw_input_ids', 'woaw_attention_mask', 'pixel_values'
        ])
        # Multi-dimensional features are passed as nested lists, one entry per row.
        dataset = dataset.add_column('image_features', image_features.tolist())
        dataset = dataset.add_column('phrase_features', phrase_features.tolist())
        dataset = dataset.add_column('woaw_features', woaw_features.tolist())
        datasets_dict[processed_dataset_name] = dataset

        # Delete the files only after the columns were added successfully, so a
        # failure above does not force the features to be extracted again.
        shutil.rmtree(features_directory, ignore_errors=True)


add_features_to_datasets(datasets_dict)

100%|██████████| 512/512 [00:13<00:00, 39.35it/s] 


Flattening the indices:   0%|          | 0/512 [00:00<?, ? examples/s]

100%|██████████| 64/64 [00:00<00:00, 670.59it/s]
100%|██████████| 64/64 [00:00<00:00, 990.66it/s]
100%|██████████| 64/64 [00:00<00:00, 918.90it/s]


In [None]:
datasets_dict['train']

Dataset({
    features: ['word', 'phrase', 'image_0_name', 'image_1_name', 'image_2_name', 'image_3_name', 'image_4_name', 'image_5_name', 'image_6_name', 'image_7_name', 'image_8_name', 'image_9_name', 'image_0', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'image_8', 'image_9', 'gold_name', 'labels', 'image_features', 'phrase_features', 'woaw_features'],
    num_rows: 512
})

# Train and evaluate

## Construct model

### Model config class

In [None]:
class RAltCLIPConfig(PretrainedConfig):
    """Configuration of the RAltCLIP ranking model.

    The instance is handed both to the encoder layers built in `RAltCLIPEncoder`
    and to `RAltCLIPModel`, which reads `num_images_to_rank`,
    `logit_scale_init_value` and `loss_func` from it. Any additional keyword
    argument is forwarded to `PretrainedConfig`.
    """

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=3,
        num_attention_heads=8,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-05,
        attention_dropout=0.0,
        num_images_to_rank=10,
        logit_scale_init_value=2.6592,
        loss_func="CE",
        **kwargs
    ):
        super().__init__(**kwargs)

        # Settings of the encoder stack.
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout

        # Settings of the ranking head and its loss.
        self.num_images_to_rank = num_images_to_rank
        self.logit_scale_init_value = logit_scale_init_value
        self.loss_func = loss_func

### Encoder class

In [None]:
# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->RAltCLIP
class RAltCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].
    Args:
        config: RAltCLIPConfig
    """

    def __init__(self, config: RAltCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Never switched on in this notebook, so the checkpointing branch of `forward` stays unused.
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded input sequence that is passed through the encoder layers.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Arguments left as None fall back to the defaults stored in the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            # Record the input of every layer; the output of the last layer is appended after the loop.
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(encoder_layer),
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        # Tuple output drops the entries that were not requested.
        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

### Output data class

In [None]:
@dataclass
class RAltCLIPOutput(ModelOutput):
    """Output of `RAltCLIPModel.forward`.

    Attributes:
        loss: training loss; stays `None` when no labels are passed to the model.
        logits: scaled similarity between the phrase and each candidate image,
            one row per example.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None

### Model class

In [None]:
class RAltCLIPModel(PreTrainedModel):
    """Relative AltCLIP.

    Re-ranks candidate images for a phrase. The precomputed phrase, image and
    (optionally) WOAW features are concatenated into one sequence, passed through
    a small transformer encoder, and the phrase position is compared with every
    image position by scaled cosine similarity.
    """

    config_class = RAltCLIPConfig

    def __init__(self, config: RAltCLIPConfig):
        super().__init__(config)
        self.rank_encoder = RAltCLIPEncoder(config)
        # Learnable temperature, stored in log space (it is exponentiated in `forward`).
        self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)
        self.num_images_to_rank = config.num_images_to_rank
        self.loss_func = config.loss_func

    def forward(
        self,
        image_features: torch.FloatTensor,
        phrase_features: torch.FloatTensor,
        woaw_features: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> RAltCLIPOutput:
        """Score every candidate image against the phrase.

        Args:
            image_features: features of the candidate images, concatenated after
                the phrase along dimension 1.
            phrase_features: features of the phrase; they occupy position 0 of the
                encoder input.
            woaw_features: optional features of the phrase without the target
                word; when given they are appended after the images.
            labels: optional index of the gold image for every example.

        Returns:
            `RAltCLIPOutput` with the logits (one score per candidate image) and,
            when `labels` is given, the loss selected by `config.loss_func`.

        Raises:
            ValueError: if `labels` is given and `config.loss_func` is neither
                'CE' nor 'SIM'.
        """
        # `is not None` is required here: comparing a tensor with `!=` is not an
        # identity check.
        if woaw_features is not None:
            phrase_image_embeds = torch.cat((phrase_features, image_features, woaw_features), dim=1)
        else:
            phrase_image_embeds = torch.cat((phrase_features, image_features), dim=1)

        # The hidden states returned by the encoder are the input of every layer
        # plus the output of the last one; they are all summed.
        rank_encoder_output = self.rank_encoder(
            inputs_embeds=phrase_image_embeds,
            output_hidden_states=True,
            return_dict=True,
        )
        hidden_states_sum = torch.stack(rank_encoder_output.hidden_states, dim=0).sum(dim=0)

        # normalized features
        hidden_states_sum = hidden_states_sum / hidden_states_sum.norm(p=2, dim=-1, keepdim=True)

        # Position 0 is the phrase; the following `num_images_to_rank` positions
        # are the candidate images.
        phrase_embeds_sum = hidden_states_sum[:,0,:].unsqueeze(1)
        image_embeds_sum = hidden_states_sum[:,1:1 + self.num_images_to_rank,:]

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_phrase = torch.matmul(phrase_embeds_sum, torch.transpose(image_embeds_sum, -1, -2)) * logit_scale
        logits = logits_per_phrase.squeeze(1)

        loss = None
        if labels is not None:
            if self.loss_func == 'CE':
                # Classification over the candidate images.
                output_loss_fct = nn.CrossEntropyLoss()
                loss = output_loss_fct(logits.view(-1, self.num_images_to_rank), labels.view(-1))
            elif self.loss_func == 'SIM':
                # Pull the gold image towards the phrase (+1) and push the other
                # candidates away from it (-1).
                _, _, hidden_size = phrase_embeds_sum.shape
                similarity_loss_fct = nn.CosineEmbeddingLoss()
                similarity_labels = torch.nn.functional.one_hot(labels, self.num_images_to_rank)
                similarity_labels = torch.where(similarity_labels == 1, 1, -1)
                loss = similarity_loss_fct(
                    torch.repeat_interleave(phrase_embeds_sum, self.num_images_to_rank, dim=1).reshape(-1, hidden_size),
                    image_embeds_sum.reshape(-1, hidden_size),
                    similarity_labels.reshape(-1)
                )
            else:
                # Fail loudly instead of silently returning no loss.
                raise ValueError(f"Unsupported loss_func {self.loss_func!r}; expected 'CE' or 'SIM'.")

        return RAltCLIPOutput(
            loss=loss,
            logits=logits,
        )

## Load evaluation metrics

In [None]:
accuracy_metric = evaluate.load('accuracy')
mrr_metric = evaluate.load('posicube/mean_reciprocal_rank')

def compute_metrics(eval_pred):
    """Compute accuracy and mean reciprocal rank for one evaluation run.

    Args:
        eval_pred: pair `(predictions, labels)`; `predictions` holds one score
            per candidate image and `labels` the index of the gold image.

    Returns:
        Dictionary with the keys 'accuracy' and 'mrr'.
    """
    predictions, labels = eval_pred

    # Accuracy: the highest-scoring candidate must be the gold image.
    predicted_labels = predictions.argmax(axis=-1)
    accuracy = accuracy_metric.compute(predictions=predicted_labels, references=labels)

    # Candidates sorted from best to worst score, then the position of the gold
    # image inside that ordering.
    ranking = np.argsort(-predictions, axis=1)
    _, gold_positions = np.where(ranking == np.expand_dims(labels, axis=1))
    mrr = mrr_metric.compute(predictions=gold_positions)

    return {'accuracy': accuracy['accuracy'], 'mrr': mrr['mrr']}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

## Train and evaluate

In [None]:
# Name of the run; also used as the output directory of the trainer.
model_name = 'RAltCLIP_CE_12_T'

def model_init():
    """Build a freshly initialised RAltCLIP model trained with the cross-entropy loss."""
    # NOTE(review): `dropout` is not a declared parameter of RAltCLIPConfig, so it
    # is forwarded to PretrainedConfig through **kwargs — confirm that it is
    # actually read by the encoder layers.
    config = RAltCLIPConfig(dropout=0.5, attention_dropout=0.5, loss_func='CE')
    model = RAltCLIPModel(config)
    return model

# Logging and checkpointing happen once per epoch.
training_args = TrainingArguments(
    output_dir=model_name,
    logging_strategy='epoch',
    logging_steps=1,
    save_strategy='epoch',
    save_steps=1,
    metric_for_best_model='accuracy',
    per_device_train_batch_size=256,
    num_train_epochs=3,
    seed=GLOBAL_SEED,
    weight_decay=0.2
)

# `model_init` (rather than a model instance) lets the trainer create the model
# itself after it has applied the seed from `training_args`.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=datasets_dict['train'],
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate the trained model separately on each test language.
eval_en_result = trainer.evaluate(datasets_dict['en_test'])
eval_it_result = trainer.evaluate(datasets_dict['it_test'])
eval_fa_result = trainer.evaluate(datasets_dict['fa_test'])

# Report the per-language scores and their unweighted mean over the three languages.
print(f"MODEL: {model_name}")
print(f"EN: Acc: {eval_en_result['eval_accuracy']}, MRR: {eval_en_result['eval_mrr']}")
print(f"IT: Acc: {eval_it_result['eval_accuracy']}, MRR: {eval_it_result['eval_mrr']}")
print(f"FA: Acc: {eval_fa_result['eval_accuracy']}, MRR: {eval_fa_result['eval_mrr']}")
print(f"OVERAL: Acc: {(eval_en_result['eval_accuracy']+eval_it_result['eval_accuracy']+eval_fa_result['eval_accuracy'])/3}, MRR: {(eval_en_result['eval_mrr']+eval_it_result['eval_mrr']+eval_fa_result['eval_mrr'])/3}")

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
2,1.3015
4,1.0605
6,0.9604


MODEL: RAltCLIP_CE_12_T
EN: Acc: 0.578125, MRR: 0.7354724702380954
IT: Acc: 0.28125, MRR: 0.4773809523809523
FA: Acc: 0.09375, MRR: 0.3039682539682539
OVERAL: Acc: 0.3177083333333333, MRR: 0.5056072255291005
