# Prerequisite

## Install requirements

In [1]:
!pip install accelerate transformers datasets[vision] evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets[vision]
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggi

## Import requirements

In [2]:
from tqdm import tqdm
import gc
import os
import shutil
import glob
import random
import numpy as np
import torch
from torch import nn
import pandas as pd
from PIL import Image as PIL_Image
from PIL import ImageFile

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import evaluate


from datasets import Dataset, Features, Value, Image, ClassLabel, concatenate_datasets
from transformers import AltCLIPProcessor, AltCLIPModel
from transformers.configuration_utils import PretrainedConfig
from transformers.models.altclip.modeling_altclip import AltCLIPEncoderLayer
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput, ModelOutput
from transformers import TrainingArguments, Trainer

## Set some parameters to have determinism

In [3]:
ImageFile.LOAD_TRUNCATED_IMAGES = True
PIL_Image.MAX_IMAGE_PIXELS = 933120000

GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


# Prepare data

## Download

In [4]:
!wget -O train_trial.zip -c https://cf-my.sharepoint.com/:u:/g/personal/camachocolladosj_cardiff_ac_uk/ERFsG4by92ZPuW1dQQGuLfcBzHifN-NX1tCL6s6g-9-RMw?download=1
!wget -O test_images.zip -c https://cf-my.sharepoint.com/:u:/g/personal/camachocolladosj_cardiff_ac_uk/ETXzWJCEdKJDtFluRbYGUGYBzfdnaeOuSwL5hiqCW-k38Q?download=1
!wget -O test_data.zip -c 'https://docs.google.com/uc?export=download&id=10vDZsY0EhzvFFR8IF-3P_2ApOF0GIMML&confirm=t'

--2023-05-17 07:18:47--  https://cf-my.sharepoint.com/:u:/g/personal/camachocolladosj_cardiff_ac_uk/ERFsG4by92ZPuW1dQQGuLfcBzHifN-NX1tCL6s6g-9-RMw?download=1
Resolving cf-my.sharepoint.com (cf-my.sharepoint.com)... 13.107.136.8, 13.107.138.8, 2620:1ec:8f8::8, ...
Connecting to cf-my.sharepoint.com (cf-my.sharepoint.com)|13.107.136.8|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /personal/camachocolladosj_cardiff_ac_uk/Documents/semeval-2023-task-1-V-WSD-train-v1.zip?ga=1 [following]
--2023-05-17 07:18:48--  https://cf-my.sharepoint.com/personal/camachocolladosj_cardiff_ac_uk/Documents/semeval-2023-task-1-V-WSD-train-v1.zip?ga=1
Reusing existing connection to cf-my.sharepoint.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 18353852513 (17G) [application/x-zip-compressed]
Saving to: ‘train_trial.zip’


2023-05-17 07:23:24 (63.5 MB/s) - ‘train_trial.zip’ saved [18353852513/18353852513]

--2023-05-17 07:23:24--  https://cf-my.sharepoint.com/

## Unzip and unify file names for convenience (Ignore disk warning)


In [5]:
!unzip train_trial.zip
!rm train_trial.zip
!unzip test_images.zip
!rm test_images.zip
!unzip test_data.zip -d test_data
!rm test_data.zip
!mkdir semeval-2023-task-1-V-WSD-train-v1/test_v1
!mv test_data/en.test.data.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/en.test.data.v1.txt
!mv test_data/en.test.gold.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/en.test.gold.v1.txt
!mv test_data/it.test.data.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/it.test.data.v1.txt
!mv test_data/it.test.gold.v1.1.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/it.test.gold.v1.txt
!mv test_data/fa.test.data.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/fa.test.data.v1.txt
!mv test_data/fa.test.gold.txt semeval-2023-task-1-V-WSD-train-v1/test_v1/fa.test.gold.v1.txt
!rm -r test_data
!mv test_images semeval-2023-task-1-V-WSD-train-v1/test_v1/test_images_v1
!mv semeval-2023-task-1-V-WSD-train-v1 semeval-2023-task-1-V-WSD-v1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: test_images/image.3011.jpg  
  inflating: __MACOSX/test_images/._image.3011.jpg  
  inflating: test_images/image.241.jpg  
  inflating: __MACOSX/test_images/._image.241.jpg  
  inflating: test_images/image.7277.jpg  
  inflating: __MACOSX/test_images/._image.7277.jpg  
  inflating: test_images/image.6169.jpg  
  inflating: __MACOSX/test_images/._image.6169.jpg  
  inflating: test_images/image.1606.jpg  
  inflating: __MACOSX/test_images/._image.1606.jpg  
  inflating: test_images/image.7263.jpg  
  inflating: __MACOSX/test_images/._image.7263.jpg  
  inflating: test_images/image.255.jpg  
  inflating: __MACOSX/test_images/._image.255.jpg  
  inflating: test_images/image.1612.jpg  
  inflating: __MACOSX/test_images/._image.1612.jpg  
  inflating: test_images/image.5474.jpg  
  inflating: __MACOSX/test_images/._image.5474.jpg  
  inflating: test_images/image.3005.jpg  
  inflating: __MACOSX/test_images/._image.

## Load datasets to pandas dataframe

In [14]:
def make_image_list():
  return [f"image_{i}_name" for i in range(10)] 

def add_image_path(row, part):
  image_names = list(row[make_image_list()])
  for i,image_name in enumerate(image_names): 
    row[f"image_{i}"] = f'semeval-2023-task-1-V-WSD-v1/{part if "test" not in part else "test"}_v1/{part if "test" not in part else "test"}_images_v1/{image_name}'
  return row

def load_dataset_data_frame(parts):
    data_frames = []
    for part in parts:
        print(f"Loading {part} data...")
        df1 = pd.read_csv(
            f'semeval-2023-task-1-V-WSD-v1/{part if "test" not in part else "test"}_v1/{part}.data.v1.txt', 
            sep='\t', 
            header=None
        )
        df2 = pd.read_csv(
            f'semeval-2023-task-1-V-WSD-v1/{part if "test" not in part else "test"}_v1/{part}.gold.v1.txt', 
            sep='\t', 
            header=None
        )

        df1.rename(
            columns={
                0: 'word', 
                1: 'phrase', 
                2: 'image_0_name', 
                3: 'image_1_name', 
                4: 'image_2_name', 
                5: 'image_3_name', 
                6: 'image_4_name', 
                7: 'image_5_name', 
                8: 'image_6_name', 
                9: 'image_7_name', 
                10: 'image_8_name', 
                11: 'image_9_name'
            },
            inplace=True
        )
        df2.rename(
            columns={
                0: 'gold_name'
            },
            inplace=True
        )
        
        # TODO: Add columns "image_{i}" to "df1" which are images' path. (~6 lines)
        df1 = df1.apply(lambda x : add_image_path(x, part), axis=1)
        # end of TODO

        df = pd.merge(df1, df2, left_index=True, right_index=True)
        df[f'labels'] = df.apply(lambda x: [i for i in range(10) if x[f'image_{i}_name'] == x['gold_name']][0], axis=1)
        data_frames.append(df)
        print(f"Done; {part} data loaded ({len(df)} rows)!")
    return data_frames

train_df, trial_df, en_test_df, it_test_df, fa_test_df = load_dataset_data_frame(['train', 'trial', 'en.test', 'it.test', 'fa.test'])

Loading train data...
Done; train data loaded (12869 rows)!
Loading trial data...
Done; trial data loaded (16 rows)!
Loading en.test data...
Done; en.test data loaded (463 rows)!
Loading it.test data...
Done; it.test data loaded (305 rows)!
Loading fa.test data...
Done; fa.test data loaded (200 rows)!


In [15]:
trial_df.head()

Unnamed: 0,word,phrase,image_0_name,image_1_name,image_2_name,image_3_name,image_4_name,image_5_name,image_6_name,image_7_name,...,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9,gold_name,labels
0,andromeda,andromeda tree,image.155.jpg,image.68.jpg,image.9.jpg,image.72.jpg,image.158.jpg,image.86.jpg,image.7.jpg,image.132.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.86.jpg,5
1,angora,angora city,image.5.jpg,image.52.jpg,image.96.jpg,image.70.jpg,image.46.jpg,image.91.jpg,image.76.jpg,image.139.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.70.jpg,3
2,anteater,marsupial anteater,image.147.jpg,image.16.jpg,image.107.jpg,image.135.jpg,image.93.jpg,image.59.jpg,image.88.png,image.131.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.107.jpg,2
3,bank,bank erosion,image.104.jpg,image.64.jpg,image.108.jpg,image.80.jpg,image.21.jpg,image.99.jpg,image.117.jpg,image.146.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.64.jpg,1
4,router,internet router,image.127.jpg,image.0.jpg,image.20.jpg,image.18.jpg,image.112.jpg,image.97.jpg,image.24.jpg,image.1.jpg,...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,semeval-2023-task-1-V-WSD-v1/trial_v1/trial_im...,image.18.jpg,3


In [16]:
trial_df["image_2"].iloc[:5][4]

'semeval-2023-task-1-V-WSD-v1/trial_v1/trial_images_v1/image.20.jpg'

## Create HuggingFace dataset from pandas dataframe

In [17]:
type(Value)

type

In [18]:

# TODO: Specify each column datatype, refer to the output of the last cell (use "Features" class) (~1 line)
features = Features({
    "word" : Value(dtype="string", id=None),
    "phrase" : Value(dtype="string", id=None),
    "image_0_name": Value(dtype="string", id=None),
    "image_1_name": Value(dtype="string", id=None),
    "image_2_name": Value(dtype="string", id=None),
    "image_3_name": Value(dtype="string", id=None),
    "image_4_name": Value(dtype="string", id=None),
    "image_5_name": Value(dtype="string", id=None),
    "image_6_name": Value(dtype="string", id=None),
    "image_7_name": Value(dtype="string", id=None),
    "image_8_name": Value(dtype="string", id=None),
    "image_9_name": Value(dtype="string", id=None),
    "image_0": Image(decode=True, id=None),
    "image_1": Image(decode=True, id=None),
    "image_2": Image(decode=True, id=None),
    "image_3": Image(decode=True, id=None),
    "image_4": Image(decode=True, id=None),
    "image_5": Image(decode=True, id=None),
    "image_6": Image(decode=True, id=None),
    "image_7": Image(decode=True, id=None),
    "image_8": Image(decode=True, id=None),
    "image_9": Image(decode=True, id=None),
    "gold_name" : Value(dtype="string", id=None),
    "labels" : ClassLabel(num_classes=10, id=None),
})
# end of TODO


train_dataset = Dataset.from_pandas(train_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(496))
trial_dataset = Dataset.from_pandas(trial_df, features=features).shuffle(seed=GLOBAL_SEED) # Whole trial dataset
en_test_dataset = Dataset.from_pandas(en_test_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(64))
it_test_dataset = Dataset.from_pandas(it_test_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(64))
fa_test_dataset = Dataset.from_pandas(fa_test_df, features=features).shuffle(seed=GLOBAL_SEED).select(range(64))

In [19]:
trial_dataset

Dataset({
    features: ['word', 'phrase', 'image_0_name', 'image_1_name', 'image_2_name', 'image_3_name', 'image_4_name', 'image_5_name', 'image_6_name', 'image_7_name', 'image_8_name', 'image_9_name', 'image_0', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'image_8', 'image_9', 'gold_name', 'labels'],
    num_rows: 16
})

## Preprocess data using AltCLIP processor

In [20]:
s = "mmd rft"
s.replace("rft", "").strip()

'mmd'

In [21]:
def get_woaw(phrase, word) -> str:
    # TODO: Omit the word from phrase (~1 line)
    woaw = phrase.replace(word, "").strip()
    # end of TODO
    return woaw

In [22]:
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")

def process_function(examples):
    # TODO: Call processor and pass the pharse, WOAW (use "omit_ambiguous_word") and images as its arguments
    # Use return_tensors="pt", padding="max_length", truncation=True, max_length=64 (~1 line)
    print(examples)
    processor_output = processor(text=[examples["phrase"], get_woaw(examples["phrase"], examples["word"])],images=[examples[f"image_{i}"] for i in range(10)] ,return_tensors="pt", padding="max_length", truncation=True, max_length=64)
    # end of TODO
    print(processor_output)
    processor_output['phrase_input_ids'] = processor_output['input_ids'][:1,:]
    processor_output['phrase_attention_mask'] = processor_output['attention_mask'][:1,:]
    processor_output['woaw_input_ids'] = processor_output['input_ids'][1:,:]
    processor_output['woaw_attention_mask'] = processor_output['attention_mask'][1:,:]
    processor_output.pop('input_ids')
    processor_output.pop('attention_mask')
    return processor_output

processed_train_dataset = train_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_train_dataset_cache')
processed_trial_dataset = trial_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_trial_dataset_cache')
processed_en_test_dataset = en_test_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_en_test_dataset_cache')
processed_it_test_dataset = it_test_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_it_test_dataset_cache')
processed_fa_test_dataset = fa_test_dataset.map(process_function, writer_batch_size=32, cache_file_name='alt_fa_test_dataset_cache')



Map:   0%|          | 0/64 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          [ 1.5216e+00,  1.5216e+00,  1.5216e+00,  ...,  8.5005e-01,
            8.3545e-01,  8.2086e-01],
          ...,
          [-3.1782e-01, -2.0103e-01, -2.8862e-01,  ...,  1.6384e+00,
            1.5070e+00,  1.4778e+00],
          [-5.2220e-01, -1.8644e-01,  6.1738e-02,  ...,  1.5800e+00,
            1.5216e+00,  1.1858e+00],
          [-4.4921e-01,  2.2232e-01,  5.4349e-01,  ...,  1.5216e+00,
            1.3756e+00,  1.1566e+00]],

         [[ 1.9398e+00,  1.9248e+00,  1.9248e+00,  ...,  1.3845e+00,
            1.3845e+00,  1.3545e+00],
          [ 1.9548e+00,  1.9398e+00,  1.9248e+00,  ...,  1.3545e+00,
            1.3695e+00,  1.3545e+00],
          [ 1.9548e+00,  1.9548e+00,  1.9398e+00,  ...,  1.3695e+00,
            1.3545e+00,  1.3395e+00],
          ...,
          [ 1.9891e-01,  3.4899e-01,  2.2893e-01,  ...,  1.7897e+00,
            1.6997e+00,  1.6247e+00],
          [-1.1196e-02,  2.8896e-01,  4.8406e-0

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
            1.9998e+00,  2.0299e+00],
          [ 1.4746e+00,  1.4596e+00,  1.4446e+00,  ...,  1.9848e+00,
            1.9998e+00,  2.0149e+00],
          ...,
          [ 8.5925e-01,  8.7426e-01,  8.5925e-01,  ...,  1.1594e+00,
            1.1444e+00,  1.1594e+00],
          [ 7.9922e-01,  8.2924e-01,  8.2924e-01,  ...,  1.1444e+00,
            1.1444e+00,  1.1444e+00],
          [ 7.0918e-01,  7.3919e-01,  7.6921e-01,  ...,  1.1294e+00,
            1.1294e+00,  1.1294e+00]],

         [[ 2.1459e+00,  2.1459e+00,  2.1317e+00,  ...,  2.1032e+00,
            2.1459e+00,  2.1459e+00],
          [ 2.1175e+00,  2.1317e+00,  2.1317e+00,  ...,  2.1317e+00,
            2.1459e+00,  2.1459e+00],
          [ 2.1317e+00,  2.1317e+00,  2.1317e+00,  ...,  2.1459e+00,
            2.1459e+00,  2.1459e+00],
          ...,
          [ 1.0794e+00,  1.0367e+00,  1.0225e+00,  ...,  1.2643e+00,
            1.2785e+00,  1.2785e+00],
         

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

{'word': 'شیر', 'phrase': 'شیر پرچرب', 'image_0_name': 'image.2793.jpg', 'image_1_name': 'image.1342.jpg', 'image_2_name': 'image.3856.jpg', 'image_3_name': 'image.3853.jpg', 'image_4_name': 'image.3852.jpg', 'image_5_name': 'image.3855.jpg', 'image_6_name': 'image.3147.jpg', 'image_7_name': 'image.3854.jpg', 'image_8_name': 'image.3851.jpg', 'image_9_name': 'image.3140.jpg', 'image_0': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x960 at 0x7FAE70151BD0>, 'image_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x700 at 0x7FAE70153E20>, 'image_2': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2148x2160 at 0x7FAE70152D40>, 'image_3': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1200x900 at 0x7FAE70152080>, 'image_4': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1704x2272 at 0x7FAE70153190>, 'image_5': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=780x470 at 0x7FAE70152230>, 'image_6': <PIL.JpegImagePlugin.JpegImageFile image 



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

        [[[ 1.0544,  1.3172,  1.3756,  ...,  1.0836,  1.0252,  1.0252],
          [ 0.9084,  1.3318,  1.3756,  ...,  1.1128,  1.0106,  1.0690],
          [ 1.2442,  1.3318,  1.3172,  ...,  1.1712,  1.0836,  1.0836],
          ...,
          [ 1.2880,  1.2296,  1.3026,  ...,  1.3756,  1.3610,  1.0398],
          [ 1.2588,  1.3464,  1.3026,  ...,  1.4194,  1.3464,  1.0982],
          [ 1.2588,  1.3464,  1.3318,  ...,  1.4632,  1.2296,  1.1566]],

         [[ 1.0243,  1.2945,  1.3545,  ...,  1.0544,  1.0093,  1.0093],
          [ 0.8743,  1.3095,  1.3545,  ...,  1.0844,  0.9793,  1.0393],
          [ 1.2194,  1.3095,  1.2945,  ...,  1.1894,  1.1144,  1.0844],
          ...,
          [ 1.2645,  1.2044,  1.2795,  ...,  1.3545,  1.3845,  1.0544],
          [ 1.2344,  1.3245,  1.2795,  ...,  1.4446,  1.3695,  1.1144],
          [ 1.2495,  1.3395,  1.3095,  ...,  1.5046,  1.2495,  1.1744]],

         [[ 1.1221,  1.3780,  1.4349

In [None]:
processed_trial_dataset

Dataset({
    features: ['word', 'phrase', 'image_0_name', 'image_1_name', 'image_2_name', 'image_3_name', 'image_4_name', 'image_5_name', 'image_6_name', 'image_7_name', 'image_8_name', 'image_9_name', 'image_0', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'image_8', 'image_9', 'gold_name', 'labels', 'pixel_values', 'phrase_input_ids', 'phrase_attention_mask', 'woaw_input_ids', 'woaw_attention_mask'],
    num_rows: 16
})

## Extract and save features using AltCLIP

In [None]:
datasets_dict = {
    'train': concatenate_datasets([processed_train_dataset, processed_trial_dataset]).shuffle(seed=GLOBAL_SEED),
    'en_test': processed_en_test_dataset,
    'it_test': processed_it_test_dataset,
    'fa_test': processed_fa_test_dataset
}

In [25]:
torch.ones(4,4).view(2,8).cuda()

tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]], device='cuda:0')

In [7]:
def get_batch(dataset, batch_size, i):
    # TODO: Return the i-th batch from dataset (~1 line)
    batch = dataset[batch_size * i: batch_size * i + batch_size]
    #end of TODO
    return batch

def remove_second_dim(tensor):
    # TODO: remove the second dim of tensor (~1 line)
    tensor = tensor.squeeze(dim=1)
    # end of TODO
    return tensor

In [8]:
def extract_features(datasets_dict, batch_size=1):
    # print("mmd")
    model = AltCLIPModel.from_pretrained("BAAI/AltCLIP").to('cuda:0')
    # print("after mmd")
    for processed_dataset_name, processed_dataset in datasets_dict.items():
        features_directory = f'alt_{processed_dataset_name}_features'
        if os.path.exists(features_directory):
            shutil.rmtree(features_directory, ignore_errors=True)
        os.mkdir(features_directory)
        processed_dataset.set_format(type='torch', columns=['phrase_input_ids', 'phrase_attention_mask', 'woaw_input_ids', 'woaw_attention_mask', 'pixel_values', 'labels'])
        for i in tqdm(range(0, processed_dataset.num_rows, batch_size)):
            # print(get_batch(processed_dataset, batch_size, i))
            batch = get_batch(processed_dataset, batch_size, i)
            # print(f"batch is {batch}")
            phrase_input_ids = remove_second_dim(batch['phrase_input_ids'])
            phrase_attention_mask = remove_second_dim(batch['phrase_attention_mask'])
            woaw_input_ids = remove_second_dim(batch['woaw_input_ids'])
            woaw_attention_mask = remove_second_dim(batch['woaw_attention_mask'])
            batch_size, image_count, image_dim0, image_dim1, image_dim2 = batch['pixel_values'].shape
            # TODO: Reshape batch['pixel_values'] to (batch_size * image_count, image_dim0, image_dim1, image_dim2) (~1 line)
            pixel_values = batch["pixel_values"].view(batch_size * image_count, image_dim0, image_dim1, image_dim2)
            # end of TODO

            # TODO: Move phrase_input_ids, phrase_attention_mask, woaw_input_ids, woaw_attention_mask, pixel_values to GPU (~5 lines)
            phrase_input_ids = phrase_input_ids.cuda()
            phrase_attention_mask = phrase_attention_mask.cuda()
            woaw_input_ids = woaw_input_ids.cuda()
            woaw_attention_mask = woaw_attention_mask.cuda()
            pixel_values = pixel_values.cuda()
            # end of TODO
            
            torch.save(torch.unsqueeze(model.get_text_features(phrase_input_ids, phrase_attention_mask), dim=1), f'{features_directory}/phrase_features_{i}.pt')
            torch.save(torch.unsqueeze(model.get_text_features(woaw_input_ids, woaw_attention_mask), dim=1), f'{features_directory}/woaw_features_{i}.pt')
            torch.save(torch.reshape(model.get_image_features(pixel_values), (batch_size, image_count, -1)), f'{features_directory}/image_features_{i}.pt')
    
extract_features(datasets_dict)

`text_config_dict` is provided which will be used to initialize `AltCLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
100%|██████████| 512/512 [07:02<00:00,  1.21it/s]
100%|██████████| 64/64 [00:51<00:00,  1.23it/s]
100%|██████████| 64/64 [00:55<00:00,  1.15it/s]
100%|██████████| 64/64 [00:54<00:00,  1.17it/s]


## Add exctracted features to HuggingFace datasets

**Restart Your Runtime Here to Avoid Out of Memory Issue!**

In [2]:
import os

In [None]:
os.kill(os.getpid(), 9)

### Import requirements after restart again

In [32]:
from tqdm import tqdm
import gc
import os
import shutil
import glob
import random
import numpy as np
import torch
from torch import nn
import pandas as pd
from PIL import Image as PIL_Image
from PIL import ImageFile

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import evaluate


from datasets import Dataset, Features, Value, Image, ClassLabel, concatenate_datasets
from transformers import AltCLIPProcessor, AltCLIPModel
from transformers.configuration_utils import PretrainedConfig
from transformers.models.altclip.modeling_altclip import AltCLIPEncoderLayer
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput, ModelOutput
from transformers import TrainingArguments, Trainer

### Set some parameters to have determinism after restart again

In [2]:
ImageFile.LOAD_TRUNCATED_IMAGES = True
PIL_Image.MAX_IMAGE_PIXELS = 933120000

GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


### Load datasets from cache files

In [3]:
processed_train_dataset = Dataset.from_file('alt_train_dataset_cache')
processed_trial_dataset = Dataset.from_file('alt_trial_dataset_cache')
processed_en_test_dataset = Dataset.from_file('alt_en_test_dataset_cache')
processed_it_test_dataset = Dataset.from_file('alt_it_test_dataset_cache')
processed_fa_test_dataset = Dataset.from_file('alt_fa_test_dataset_cache')

### Add features to datasets

In [4]:
datasets_dict = {
    'train': concatenate_datasets([processed_train_dataset, processed_trial_dataset]).shuffle(seed=GLOBAL_SEED),
    'en_test': processed_en_test_dataset,
    'it_test': processed_it_test_dataset,
    'fa_test': processed_fa_test_dataset
}



In [5]:
torch.tensor([1,2,3]).tolist()

[1, 2, 3]

In [6]:
def add_features_to_datasets(datasets_dict, batch_size=1):
    for processed_dataset_name, processed_dataset in datasets_dict.items():
        datasets_dict[processed_dataset_name] = datasets_dict[processed_dataset_name].remove_columns([
            'phrase_input_ids', 'phrase_attention_mask', 'woaw_input_ids', 'woaw_attention_mask', 'pixel_values'
        ])
        features_directory = f'alt_{processed_dataset_name}_features'
        phrase_features = []
        woaw_features = []
        image_features = []
        for i in tqdm(range(0, processed_dataset.num_rows, batch_size)):
            phrase_features.append(torch.load(f'{features_directory}/phrase_features_{i}.pt'))
            woaw_features.append(torch.load(f'{features_directory}/woaw_features_{i}.pt'))
            image_features.append(torch.load(f'{features_directory}/image_features_{i}.pt'))
        phrase_features = torch.cat(phrase_features, dim=0)
        woaw_features = torch.cat(woaw_features, dim=0)
        image_features = torch.cat(image_features, dim=0)


        # TODO: Add phrase_features, woaw_features, image_features to datasets_dict[processed_dataset_name] (~3 lines)
        datasets_dict[processed_dataset_name] = datasets_dict[processed_dataset_name].add_column("phrase_features", phrase_features.tolist())
        datasets_dict[processed_dataset_name] = datasets_dict[processed_dataset_name].add_column("woaw_features", woaw_features.tolist())
        datasets_dict[processed_dataset_name] = datasets_dict[processed_dataset_name].add_column("image_features", image_features.tolist())
        # end of TODO
        shutil.rmtree(features_directory, ignore_errors=True)


add_features_to_datasets(datasets_dict)

100%|██████████| 512/512 [00:10<00:00, 50.26it/s] 


Flattening the indices:   0%|          | 0/512 [00:00<?, ? examples/s]

100%|██████████| 64/64 [00:00<00:00, 685.64it/s]
100%|██████████| 64/64 [00:00<00:00, 716.18it/s]
100%|██████████| 64/64 [00:00<00:00, 576.67it/s]


In [7]:
datasets_dict['train']

Dataset({
    features: ['word', 'phrase', 'image_0_name', 'image_1_name', 'image_2_name', 'image_3_name', 'image_4_name', 'image_5_name', 'image_6_name', 'image_7_name', 'image_8_name', 'image_9_name', 'image_0', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'image_8', 'image_9', 'gold_name', 'labels', 'phrase_features', 'woaw_features', 'image_features'],
    num_rows: 512
})

# Train and evaluate

## Construct model

### Model config class

In [8]:
class RAltCLIPConfig(PretrainedConfig):

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=3,
        num_attention_heads=8,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-05,
        attention_dropout=0.0,
        num_images_to_rank=10,
        logit_scale_init_value=2.6592,
        loss_func="CE",
        **kwargs
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.num_images_to_rank = num_images_to_rank
        self.logit_scale_init_value = logit_scale_init_value
        self.loss_func = loss_func

### Encoder class

In [9]:
# Copied from transformers.models.clip.modeling_altclip.AltCLIPEncoder with AltCLIP->RAltCLIP
class RAltCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].
    Args:
        config: AltCLIPConfig
    """

    def __init__(self, config: RAltCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(encoder_layer),
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

### Output data class

In [10]:
@dataclass
class RAltCLIPOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None

### Model class

In [35]:
class RAltCLIPModel(PreTrainedModel):
    """Relative AltCLIP"""

    config_class = RAltCLIPConfig
    
    def __init__(self, config: RAltCLIPConfig):
        super().__init__(config)
        self.rank_encoder = RAltCLIPEncoder(config)
        self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)
        self.num_images_to_rank = config.num_images_to_rank
        self.loss_func = config.loss_func

    def forward(
        self,
        image_features: torch.FloatTensor,
        phrase_features: torch.FloatTensor,
        woaw_features: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> RAltCLIPOutput:
        if woaw_features != None:
            phrase_image_embeds = torch.cat((phrase_features, image_features, woaw_features), dim=1)
        else:
            phrase_image_embeds = torch.cat((phrase_features, image_features), dim=1)

        # TODO: Pass phrase_image_embeds to rank_encoder and get sum of all hidden states (~2 lines)
        rank_encoder_output = self.rank_encoder(phrase_image_embeds, output_hidden_states=True)
        # print(f"last hidden  output is : {rank_encoder_output.last_hidden_state.shape}")
        # print(f"rank encoder output is : {rank_encoder_output.shape}")
        hidden_states_sum = torch.sum(torch.stack(rank_encoder_output.hidden_states), dim=0)
        # hidden_states_sum = hidden_states_sum.unsqueeze(1)
        # print(hidden_states_sum.shape)
        # end of TODO
        
        # normalized features
        hidden_states_sum = hidden_states_sum / hidden_states_sum.norm(p=2, dim=-1, keepdim=True)
        # print(hidden_states_sum.shape)
        phrase_embeds_sum = hidden_states_sum[:,0,:].unsqueeze(1)
        image_embeds_sum = hidden_states_sum[:,1:11,:]
        
        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_phrase = torch.matmul(phrase_embeds_sum, torch.transpose(image_embeds_sum, -1, -2)) * logit_scale
        logits = logits_per_phrase.squeeze(1)
        
        loss = None
        if labels is not None:
            if self.loss_func == 'CE':
                output_loss_fct = nn.CrossEntropyLoss()
                loss = output_loss_fct(logits.view(-1, self.num_images_to_rank), labels.view(-1))
            elif self.loss_func == 'SIM':
                _, _, hidden_size = phrase_embeds_sum.shape
                similarity_loss_fct = nn.CosineEmbeddingLoss()
                similarity_labels = torch.nn.functional.one_hot(labels, self.num_images_to_rank)
                similarity_labels = torch.where(similarity_labels == 1, 1, -1)
                loss = similarity_loss_fct(
                    torch.repeat_interleave(phrase_embeds_sum, self.num_images_to_rank, dim=1).reshape(-1, hidden_size), 
                    image_embeds_sum.reshape(-1, hidden_size), 
                    similarity_labels.reshape(-1)
                )
            
        return RAltCLIPOutput(
            loss=loss,
            logits=logits,
        )

## Load evaluation metrics

In [36]:
accuracy_metric = evaluate.load('accuracy')
mrr_metric = evaluate.load('posicube/mean_reciprocal_rank')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    accuracy = accuracy_metric.compute(predictions=predictions.argmax(axis=-1), references=labels)
    mrr = mrr_metric.compute(predictions=np.where(np.argsort(-predictions, axis=1) == np.expand_dims(labels, axis=1))[1])
    return {'accuracy': accuracy['accuracy'], 'mrr': mrr['mrr']}

## Train and evaluate

In [37]:
model_name = 'RAltCLIP_CE_12_T'

def model_init():
    config = RAltCLIPConfig(dropout=0.5, attention_dropout=0.5, loss_func='CE')
    model = RAltCLIPModel(config)
    return model

training_args = TrainingArguments(
    output_dir=model_name,
    logging_strategy='epoch',
    logging_steps=1,
    save_strategy='epoch',
    save_steps=1,
    metric_for_best_model='accuracy',
    per_device_train_batch_size=256,
    num_train_epochs=3,
    seed=GLOBAL_SEED,
    weight_decay=0.2
)

# TODO: Set the argumetns 
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=datasets_dict['train'],
    compute_metrics=compute_metrics,
)
# end of TODO

trainer.train()

eval_en_result = trainer.evaluate(datasets_dict['en_test'])
eval_it_result = trainer.evaluate(datasets_dict['it_test'])
eval_fa_result = trainer.evaluate(datasets_dict['fa_test'])

print(f"MODEL: {model_name}")
print(f"EN: Acc: {eval_en_result['eval_accuracy']}, MRR: {eval_en_result['eval_mrr']}")
print(f"IT: Acc: {eval_it_result['eval_accuracy']}, MRR: {eval_it_result['eval_mrr']}")
print(f"FA: Acc: {eval_fa_result['eval_accuracy']}, MRR: {eval_fa_result['eval_mrr']}")
print(f"OVERAL: Acc: {(eval_en_result['eval_accuracy']+eval_it_result['eval_accuracy']+eval_fa_result['eval_accuracy'])/3}, MRR: {(eval_en_result['eval_mrr']+eval_it_result['eval_mrr']+eval_fa_result['eval_mrr'])/3}")

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
2,1.3015
4,1.0605
6,0.9604


MODEL: RAltCLIP_CE_12_T
EN: Acc: 0.578125, MRR: 0.7354724702380954
IT: Acc: 0.28125, MRR: 0.4773809523809523
FA: Acc: 0.09375, MRR: 0.3039682539682539
OVERAL: Acc: 0.3177083333333333, MRR: 0.5056072255291005
