# WRITING THE MODEL HERE FOR LOADING WEIGHTS

In [7]:
import pandas as pd
import os
import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class VQAModel(nn.Module):
    def __init__(self):
        super(VQAModel, self).__init__()
        self.gpt2_model = GPT2LMHeadModel.from_pretrained("distilbert/distilgpt2") 
        self.project_down = nn.Linear(768*2, 768)
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")

    def forward(self, image_features, question_features):
        print("image_features shape:", image_features.shape)
        print("question_features shape:", question_features.shape)
        combined_features = torch.cat((image_features, question_features), dim=-1)
        combined_features = self.project_down(combined_features)
        combined_features = combined_features.unsqueeze(1) 
        outputs = self.gpt2_model(inputs_embeds=combined_features)
        logits = outputs.logits
        eos_token_id = self.gpt2_tokenizer.encode("<END>", add_prefix_space=True)[0]
        generated_sequence = self.gpt2_model.generate(inputs_embeds=combined_features, 
                                                      max_length=2, 
                                                      pad_token_id=eos_token_id, 
                                                      repetition_penalty=5.7,
                                                      temperature=0.9,
                                                      eos_token_id=eos_token_id)
        return logits, generated_sequence

In [2]:
import torch  # PyTorch library for tensor computations and deep learning
import torch.nn as nn  # Neural network modules and layers
import torch.optim as optim  # Optimization algorithms
from tqdm import tqdm  # For displaying progress bars during iterations

# -----------------------------------------------------------------------------------
# Model Loading and Preparation
# -----------------------------------------------------------------------------------

# Determine the computing device: use GPU ('cuda') if available, else fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the VQA (Visual Question Answering) model and move it to the selected device
model = VQAModel().to(device)

# Wrap the model with DataParallel to utilize multiple GPUs if available
# DataParallel allows parallel processing across multiple GPUs to speed up computations
model = torch.nn.DataParallel(model)

# -----------------------------------------------------------------------------------
# Loading Pre-trained Model Weights
# -----------------------------------------------------------------------------------

# Define the path to the pre-trained model's state dictionary
model_path = '/kaggle/input/open-ended-vqa-qa/best_model.pth'

# Load the state dictionary from the specified path
# torch.load handles loading the saved state dictionary
state_dict = torch.load(model_path)

# Load the state dictionary into the model
# This updates the model's parameters with the pre-trained weights
model.load_state_dict(state_dict)

# Extract the underlying model from DataParallel wrapper
# When using DataParallel, the actual model is accessible via the `.module` attribute
model_without_dp = model.module

# -----------------------------------------------------------------------------------
# Saving the Model's State Dictionary Without DataParallel
# -----------------------------------------------------------------------------------

# Move the model to CPU to ensure compatibility when loading without GPU
# This is useful for inference on machines without CUDA support
model_without_dp.cpu()

# Define the path where the CPU-compatible state dictionary will be saved
cpu_model_path = '/kaggle/working/model_cpu.pth'

# Save the state dictionary of the model without DataParallel
# This allows for easier loading in environments that do not use multiple GPUs
torch.save(model_without_dp.state_dict(), cpu_model_path)

# -----------------------------------------------------------------------------------
# Summary
# -----------------------------------------------------------------------------------

"""
This script performs the following operations:

1. **Device Configuration**:
    - Checks if a GPU is available and sets the computation device accordingly.

2. **Model Initialization**:
    - Instantiates the `VQAModel` and moves it to the selected device.
    - Wraps the model with `DataParallel` to enable parallel processing across multiple GPUs, enhancing computational efficiency.

3. **Loading Pre-trained Weights**:
    - Loads a pre-trained model's state dictionary from a specified path.
    - Updates the initialized model's parameters with these pre-trained weights.

4. **Preparing Model for CPU Inference**:
    - Extracts the underlying model from the `DataParallel` wrapper to remove dependencies on multiple GPUs.
    - Moves the model to CPU to ensure compatibility in environments without CUDA support.
    - Saves the CPU-compatible state dictionary to a designated path for future inference or deployment.

**Use Case**:
This script is particularly useful when you have a model trained on a multi-GPU setup and wish to deploy or perform inference on a single GPU or CPU environment. By saving a CPU-compatible version of the model's state dictionary, you ensure broader compatibility and ease of deployment across different hardware configurations.
"""



config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  model.load_state_dict(torch.load(model_path))


In [3]:
import torch  # PyTorch library for tensor computations and deep learning

# -----------------------------------------------------------------------------------
# Model Loading and Preparation
# -----------------------------------------------------------------------------------

# Determine the computing device: use GPU ('cuda') if available, else fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize the VQA (Visual Question Answering) model and move it to the selected device
# Ensure that the VQAModel class is defined elsewhere in your codebase
model = VQAModel().to(device)
print("VQAModel initialized and moved to device.")

# Define the path to the pre-trained model's state dictionary
model_path = '/kaggle/working/model_cpu.pth'
print(f"Loading model state dictionary from: {model_path}")

# Load the state dictionary from the specified path
# torch.load handles loading the saved state dictionary
# If the model was saved on a GPU and you're loading it on a CPU, use map_location
state_dict = torch.load(model_path, map_location=device)

# Load the state dictionary into the model
# This updates the model's parameters with the pre-trained weights
model.load_state_dict(state_dict)
print("Model state dictionary loaded successfully.")

# -----------------------------------------------------------------------------------
# Summary
# -----------------------------------------------------------------------------------

"""
This script performs the following operations:

1. **Device Configuration**:
    - Checks if a GPU is available and sets the computation device accordingly.
    - Prints out the device being used for computations.

2. **Model Initialization**:
    - Instantiates the `VQAModel` and moves it to the selected device (GPU or CPU).
    - Prints a confirmation message upon successful initialization.

3. **Loading Pre-trained Weights**:
    - Specifies the file path to the saved state dictionary (`.pth` file) of the pre-trained model.
    - Loads the state dictionary from the specified path, ensuring compatibility with the selected device.
    - Updates the initialized model's parameters with these pre-trained weights.
    - Prints a confirmation message upon successful loading of the state dictionary.

**Use Case**:
This script is particularly useful when you have a pre-trained VQA model saved as a state dictionary and wish to load it for further inference, evaluation, or continued training. By specifying the device dynamically, the script ensures flexibility across different hardware setups without manual intervention.
"""



  model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [8]:
import numpy as np
import pickle

gif_feat_dic = "/kaggle/input/cleaned-gif-embeddings/combined_file.pkl"
pickle_dataset = "/kaggle/input/cleaned-gif-embeddings/questions_with_gpt2_embeddings_using_gpu_cleaned.pkl"

with open(gif_feat_dic, 'rb') as f:
    gif_feat_dict = pickle.load(f)
tgif_frame = pd.read_pickle(pickle_dataset)

  return torch.load(io.BytesIO(b))


# WRITING A FUNCTION TO GET THE VDEO EMBEDDINGS AND TEXT EMBEDDINGS 

In [5]:
def getitem(idx):
    """
    Retrieves a sample from the tgif_frame dataset given an index and returns the relevant features 
    such as GIF embeddings, question features, and answers.

    Args:
    idx (int): The index of the row in the tgif_frame dataset to retrieve the sample.

    Returns:
    dict: A dictionary containing the following:
        - 'gif_embeddings': Precomputed GIF embeddings from gif_feat_dict.
        - 'question_embeddings': Tensor containing the question features.
        - 'answer': The corresponding answer for the question, appended with "<END>" token.
        - 'url': The URL of the GIF.
        - 'question': The textual question.
    """
    
    # Retrieve the GIF name by adding '.gif' extension to the value from the second column
    gif_name = tgif_frame.iloc[idx, 1] + '.gif'
    
    # Extract the question features (assumed to be a NumPy array) and convert it to a PyTorch tensor
    question_features = torch.from_numpy(tgif_frame.iloc[idx, 4])
    
    # Retrieve the answer from the third column, and append the "<END>" token for sequence processing
    answers = tgif_frame.iloc[idx, 3] + " <END>"
    
    # Get the precomputed GIF embeddings from a dictionary using the gif_name as the key
    gif_features = gif_feat_dict[gif_name]
    
    # Retrieve the URL of the GIF from the first column
    gif_url = tgif_frame.iloc[idx, 0]
    
    # Get the textual question from the second column
    question = tgif_frame.iloc[idx, 2]

    # Construct and return the sample dictionary with the retrieved features
    sample = {
        'gif_embeddings': gif_features,
        'question_embeddings': question_features,
        'answer': answers,
        'url': gif_url,
        'question': question
    }

    return sample


# INFERENCE FUNCTION

In [6]:
def inference(n, model, tokenizer, device):
    """
    Runs inference on the model using the data at index `n`, returning the generated answer, the original question,
    and the URL of the GIF.

    Args:
    n (int): The index of the data point to use for inference.
    model (torch.nn.Module): The VQA model used for generating answers.
    tokenizer (GPT2Tokenizer): Tokenizer for decoding the generated answer tokens.
    device (torch.device): The device (CPU or GPU) to run the inference on.

    Returns:
    dict: A dictionary containing:
        - 'question': The input question.
        - 'generated_answers': The model-generated answers.
        - 'gif_url': The URL of the GIF.
        - 'expected_answer': The actual answer from the dataset.
    """
    
    # Fetch the data for the given index `n`
    batch = getitem(n)
    
    # Extract image features, question features, and the answer from the batch
    image_features, question_features, answers = batch['gif_embeddings'], batch['question_embeddings'], batch['answer']
    
    # Unsqueeze the features to add a batch dimension and move them to the specified device
    image_features = image_features.unsqueeze(0).to(device)
    question_features = question_features.unsqueeze(0).to(device)
    
    # Set the model to evaluation mode
    model.eval()
    
    # Forward pass through the model to get logits and the generated sequence of tokens
    with torch.no_grad():
        logits, gen_seq = model(image_features, question_features)
    
    # Decode the generated token sequences into human-readable strings
    generated_answers = [tokenizer.decode(g, skip_special_tokens=True) for g in gen_seq]
    
    # Create a dictionary to return the relevant information
    result = {
        'question': batch['question'],
        'generated_answers': generated_answers,
        'gif_url': batch['url'],
        'expected_answer': answers
    }
    
    return result




image_features shape: torch.Size([1, 768])
question_features shape: torch.Size([1, 768])


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


what does the guy kick a dog and then hits him in the crotch with a rake ?
['dog']
https://33.media.tumblr.com/9cd7ae01d3187758321523d08fc60db4/tumblr_njrzdxnmrz1tgetb4o1_250.gif
dog <END>


In [1]:
# Initialize the tokenizer and device
tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Choose a random index for inference
n = np.random.randint(0, 18000)

# Perform inference
result = inference(n, model, tokenizer, device)

# Print the outputs
print("Question:", result['question'])
print("Generated Answer:", result['generated_answers'])
print("GIF URL:", result['gif_url'])
print("Expected Answer:", result['expected_answer'])
