In [1]:
import timm
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import models,transforms
import torch_geometric

  from .autonotebook import tqdm as notebook_tqdm


: 

: 

In [16]:
# Pick the compute backend, preferring CUDA, then Apple MPS, then CPU.
# n_gpus is bound up front (0 when CUDA is unavailable) so that code reading
# it does not hit a NameError on machines without a CUDA device.
n_gpus = 0
if torch.cuda.is_available():
    device = 'cuda'
    n_gpus = torch.cuda.device_count()
# torch.backends.mps is looked up defensively: builds that do not ship the
# attribute fall through to the CPU branch instead of raising AttributeError.
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [17]:
# Project-local dataset class; its implementation is not visible in this notebook.
from Dataset.HatefulMemeDataset import HatefulMemeDataset

# Presumably (data root, split name) -- TODO confirm against the class definition.
# A later cell unpacks dataset[0] into an (image, text, label) triple.
dataset = HatefulMemeDataset('./data','train') 


In [18]:
class ImageEncoder(nn.Module):
    """
    Wrap a timm backbone that turns a batch of images into one feature
    vector per image.

    Args:
        model_name: timm architecture name handed to ``timm.create_model``.
        pretrained: forwarded to ``timm.create_model`` as its ``pretrained`` flag.
        trainable: value written to ``requires_grad`` of every backbone parameter.

    The backbone is created with ``num_classes=0`` and ``global_pool="avg"``;
    per the timm documentation this is expected to drop the classifier head
    and return average-pooled features -- confirm for non-default backbones.
    """

    def __init__(
        self, model_name='resnet50', pretrained=True, trainable=True
    ):
        super().__init__()
        backbone = timm.create_model(
            model_name, pretrained=pretrained, num_classes=0, global_pool="avg"
        )
        # Freeze or unfreeze every backbone parameter in one call.
        backbone.requires_grad_(trainable)
        self.model = backbone

    def forward(self, x):
        """Return whatever the wrapped backbone produces for ``x``."""
        features = self.model(x)
        return features

In [19]:
class TextEncoder(nn.Module):
    """
    DistilBERT wrapper that returns one hidden-state vector per input sequence.

    Args:
        model_name: checkpoint name passed to ``DistilBertModel.from_pretrained``;
            only used when ``pretrained`` is truthy.
        pretrained: when truthy, load the checkpoint; otherwise build a
            ``DistilBertModel`` from a default ``DistilBertConfig()``.
        trainable: value written to ``requires_grad`` of every model parameter.
    """

    def __init__(self, model_name='distilbert-base-uncased', pretrained=True, trainable=True):
        super().__init__()
        self.model = (
            DistilBertModel.from_pretrained(model_name)
            if pretrained
            else DistilBertModel(config=DistilBertConfig())
        )
        # Freeze or unfreeze every transformer parameter in one call.
        self.model.requires_grad_(trainable)

        # The hidden state of the CLS token (sequence position 0) serves as
        # the embedding of the whole sentence.
        self.target_token_idx = 0

    def forward(self, input_ids, attention_mask):
        """Return the last-layer hidden state at ``target_token_idx`` for each sequence."""
        hidden_states = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return hidden_states[:, self.target_token_idx, :]

In [20]:
class ProjectionHead(nn.Module):
    """
    Residual projection block.

    The input is mapped linearly to ``projection_dim``; a GELU -> linear ->
    dropout branch computed from that projection is added back to it and the
    sum is layer-normalised.

    Args:
        embedding_dim: size of the last dimension of the incoming features.
        projection_dim: size of the last dimension of the output.
        dropout: drop probability applied on the residual branch.
    """

    def __init__(
        self,
        embedding_dim,
        projection_dim=256,
        dropout=0.1
    ):
        super().__init__()
        width = projection_dim
        self.projection = nn.Linear(embedding_dim, width)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(width, width)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(width)

    def forward(self, x):
        """Project ``x``, add the refined residual branch, and normalise."""
        shortcut = self.projection(x)
        branch = self.dropout(self.fc(self.gelu(shortcut)))
        return self.layer_norm(branch + shortcut)

In [23]:
# Force CPU from here on; this overrides whatever the earlier
# device-selection cell assigned to `device`.
device = 'cpu'

In [24]:
# Take the first sample; it is unpacked as an (image, text, label) triple.
image,text,label = dataset[0]
# Convert the image to a tensor, add a leading batch axis with unsqueeze(0),
# and move it to `device`.
# NOTE(review): no resizing or normalisation is applied before the pretrained
# backbone sees this tensor -- confirm that is intended.
imgTensor = transforms.ToTensor()(image).unsqueeze(0).to(device)

./data/img/42953.png


In [26]:
# Build the image encoder with its defaults and place it on the same device
# as `imgTensor`; without the .to(device) the forward pass fails with a
# device mismatch whenever `device` is not 'cpu'.
# NOTE(review): the module is left in training mode and gradients are
# tracked -- switch to .eval() / torch.no_grad() if this is inference only.
imgModel = ImageEncoder().to(device)
image_features = imgModel(imgTensor)

In [40]:
# Projection head for image features, kept on the same device as the encoder
# inputs. 2048 is presumably the width of `image_features` for the default
# backbone (a torch.Size([1, 2048]) output is recorded in this notebook).
image_projection = ProjectionHead(2048).to(device)

In [27]:
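# NOTE: reference snippet, intentionally commented out and never executed. It
# relies on names that are not defined in the visible cells of this notebook
# (build_loaders, CLIPModel, CFG, tqdm).
# def get_image_embeddings(valid_df, model_path):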
#     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#     valid_loader = build_loaders(valid_df, tokenizer, mode="valid")
    
#     model = CLIPModel().to(CFG.device)
#     model.load_state_dict(torch.load(model_path, map_location=CFG.device))
#     model.eval()
    
#     valid_image_embeddings = []
#     with torch.no_grad():
#         for batch in tqdm(valid_loader):
#             image_features = model.image_encoder(batch["image"].to(CFG.device))
#             image_embeddings = model.image_projection(image_features)
#             valid_image_embeddings.append(image_embeddings)
#     return model, torch.cat(valid_image_embeddings)

torch.Size([1, 2048])

In [29]:
# Tokenizer matching the checkpoint that TextEncoder loads by default.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Encode the sample text as a batch of one. truncation=True clips over-long
# text to the tokenizer's maximum length instead of producing a sequence the
# model cannot accept; short inputs are unaffected.
encoded_query = tokenizer([text], truncation=True)

In [32]:
# Build the text encoder with its defaults and place it on the same device
# that the tokenised batch is moved to; otherwise the forward pass fails with
# a device mismatch whenever `device` is not 'cpu'.
txtEncoder = TextEncoder().to(device)

Downloading pytorch_model.bin: 100%|██████████| 256M/256M [00:06<00:00, 38.4MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
# Projection head for text features, kept on the same device as the text
# encoder. 768 matches the last dimension of `text_features`
# (torch.Size([1, 768]) is recorded in this notebook).
text_projection = ProjectionHead(768).to(device)

In [34]:
# Inspect the raw tokenizer output (plain Python lists at this point).
encoded_query

{'input_ids': [[101, 2049, 2037, 2839, 2025, 2037, 3609, 2008, 5609, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [36]:
# Convert every tokenizer field into a tensor on `device`, keeping the
# tokenizer's field names as dictionary keys.
batch = dict(
    (field, torch.tensor(ids).to(device))
    for field, ids in encoded_query.items()
)

In [37]:
# Inspect the tensorised batch.
batch

{'input_ids': tensor([[ 101, 2049, 2037, 2839, 2025, 2037, 3609, 2008, 5609,  102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [38]:
# Run the text encoder on the tokenised batch. TextEncoder.forward returns the
# last-layer hidden state at position `target_token_idx` (0) for each sequence.
text_features = txtEncoder(
        input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
    )

In [39]:
# Check the encoder output shape (recorded output: torch.Size([1, 768])).
text_features.shape

torch.Size([1, 768])

In [42]:
# Map the text features through the text projection head, then inspect the
# shape of the resulting embedding.
text_embeddings = text_projection(text_features)
text_embeddings.shape


In [44]:
# Map the image features through the image projection head, then inspect the
# shape of the resulting embedding.
image_embeddings = image_projection(image_features)
image_embeddings.shape

In [52]:
# A cell renders only its final expression, so the three dot products are
# collected into one tuple to make all of them visible:
# (image . image, text . text, text . image).
# The embeddings are not L2-normalised here, so these are raw dot products
# rather than cosine similarities.
(
    image_embeddings @ image_embeddings.T,
    text_embeddings @ text_embeddings.T,
    text_embeddings @ image_embeddings.T,
)

torch.Size([1, 256])

In [55]:
# Check the shape of the batched image tensor
# (recorded output: torch.Size([1, 3, 400, 265])).
imgTensor.shape

torch.Size([1, 3, 400, 265])

In [None]:
# Show the installed torch_geometric version.
torch_geometric.__version__


In [50]:
class HatefulMemeDataset(torch_gemo.utils.data.Dataset):


tensor([[255.9594]], grad_fn=<MmBackward0>)

tensor([[-20.4560]], grad_fn=<MmBackward0>)