# WRITING THE MODEL HERE FOR LOADING WEIGHTS

In [7]:
import pandas as pd
import os
import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class VQAModel(nn.Module):
    def __init__(self):
        super(VQAModel, self).__init__()
        self.project_down = nn.Linear(768*2, 768)   
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
        self.gpt2_model = GPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")

    def forward(self, image_features, question_features):
        
        # Concatenating features
        combined_features = torch.cat((image_features, question_features), dim=-1) 

        # Resizing features to match inpput dimensions
        combined_features = self.project_down(combined_features) # Resizing features to match inpput dimensions
        combined_features = combined_features.unsqueeze(1) 
        
        # Generate outputs
        outputs = self.gpt2_model(inputs_embeds=combined_features)
        logits = outputs.logits
        eos_token_id = self.gpt2_tokenizer.encode("<END>", add_prefix_space=True)[0]
        generated_sequence = self.gpt2_model.generate(inputs_embeds=combined_features, 
                                                      max_length=2, 
                                                      pad_token_id=eos_token_id, 
                                                      repetition_penalty=5.7,
                                                      temperature=0.9,
                                                      eos_token_id=eos_token_id)
        return logits, generated_sequence

In [None]:
import torch  # PyTorch library for tensor computations and deep learning

# -----------------------------------------------------------------------------------
# Model Loading and Preparation
# -----------------------------------------------------------------------------------

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = VQAModel().to(device)
print("VQAModel initialized and moved to device.")

# Define the path to the pre-trained model's state dictionary
model_path = '/kaggle/working/model_cpu.pth'
print(f"Loading model state dictionary from: {model_path}")

# Load the state dictionary from the specified path
# torch.load handles loading the saved state dictionary
# If the model was saved on a GPU and you're loading it on a CPU, use map_location
state_dict = torch.load(model_path, map_location=device)
model.load_state_dict(state_dict)
print("Model state dictionary loaded successfully.")


  model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [None]:
import numpy as np
import pickle

gif_feat_dic = "/kaggle/input/cleaned-gif-embeddings/combined_file.pkl"
pickle_dataset = "/kaggle/input/cleaned-gif-embeddings/questions_with_gpt2_embeddings_using_gpu_cleaned.pkl"

with open(gif_feat_dic, 'rb') as f:
    gif_feat_dict = pickle.load(f)
tgif_frame = pd.read_pickle(pickle_dataset)

  return torch.load(io.BytesIO(b))


# WRITING A FUNCTION TO GET THE VDEO EMBEDDINGS AND TEXT EMBEDDINGS 

In [None]:
def getitem(idx):
    """
    Retrieves a sample from the tgif_frame dataset given an index and returns the relevant features 
    such as GIF embeddings, question features, and answers.

    Args:
    idx (int): The index of the row in the tgif_frame dataset to retrieve the sample.

    Returns:
    dict: A dictionary containing the following:
        - 'gif_embeddings': Precomputed GIF embeddings from gif_feat_dict.
        - 'question_embeddings': Tensor containing the question features.
        - 'answer': The corresponding answer for the question, appended with "<END>" token.
        - 'url': The URL of the GIF.
        - 'question': The textual question.
    """
    # Gets question features
    question_features = torch.from_numpy(tgif_frame.iloc[idx, 4])

    # Gets answers
    answers = tgif_frame.iloc[idx, 3]

    # Gets the GIF features from the dictionary
    gif_name = tgif_frame.iloc[idx, 1] + '.gif'
    gif_features = gif_feat_dict[gif_name]
    gif_url = tgif_frame.iloc[idx, 0]
    question = tgif_frame.iloc[idx, 2]

    sample = {
        'gif_embeddings': gif_features,
        'question_embeddings': question_features,
        'answer': answers,
        'url': gif_url,
        'question': question
    }

    return sample


# INFERENCE FUNCTION

In [None]:
def inference(n, model, tokenizer, device):
    """
    Runs inference on the model using the data at index `n`, returning the generated answer, the original question,
    and the URL of the GIF.

    Args:
    n (int): The index of the data point to use for inference.
    model (torch.nn.Module): The VQA model used for generating answers.
    tokenizer (GPT2Tokenizer): Tokenizer for decoding the generated answer tokens.
    device (torch.device): The device (CPU or GPU) to run the inference on.

    Returns:
    dict: A dictionary containing:
        - 'question': The input question.
        - 'generated_answers': The model-generated answers.
        - 'gif_url': The URL of the GIF.
        - 'expected_answer': The actual answer from the dataset.
    """
    
    # Fetch the data for the given index `n`
    batch = getitem(n)
    
    # Extract image features, question features, and the answer from the batch
    image_features, question_features, answers = batch['gif_embeddings'], batch['question_embeddings'], batch['answer']
    image_features = image_features.unsqueeze(0).to(device)
    question_features = question_features.unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        logits, gen_seq = model(image_features, question_features)
    generated_answers = [tokenizer.decode(g, skip_special_tokens=True) for g in gen_seq]
    
    # Create a dictionary to return the relevant information
    result = {
        'question': batch['question'],
        'generated_answers': generated_answers,
        'gif_url': batch['url'],
        'expected_answer': answers
    }
    
    return result




image_features shape: torch.Size([1, 768])
question_features shape: torch.Size([1, 768])


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


what does the guy kick a dog and then hits him in the crotch with a rake ?
['dog']
https://33.media.tumblr.com/9cd7ae01d3187758321523d08fc60db4/tumblr_njrzdxnmrz1tgetb4o1_250.gif
dog <END>


In [None]:
# Initialize the tokenizer and device
tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Choose a random index for inference
n = np.random.randint(0, 18000)
result = inference(n, model, tokenizer, device)
print("Question:", result['question'])
print("Generated Answer:", result['generated_answers'])
print("GIF URL:", result['gif_url'])
print("Expected Answer:", result['expected_answer'])
