# THE GIVEN CODE SHOWS THE PROCESS OF GENERATING QUESTION EMBEDDINGS

# In [None]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
from tqdm import tqdm 

# -----------------------------------------------------------------------------------
# Configuration and Setup
# -----------------------------------------------------------------------------------


# Count the CUDA devices PyTorch can see; used later to decide on multi-GPU wrapping.
num_gpus = torch.cuda.device_count()

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Number of GPUs available: {num_gpus}")

# -----------------------------------------------------------------------------------
# Loading the Pre-trained GPT-2 Model and Tokenizer
# -----------------------------------------------------------------------------------


# Both the tokenizer and the model are loaded from the same checkpoint.
_GPT2_CHECKPOINT = "openai-community/gpt2-medium"

tokenizer = GPT2Tokenizer.from_pretrained(_GPT2_CHECKPOINT)

# Register "[PAD]" as the padding token when it is absent from the vocabulary,
# so that batches can be padded to a common length.
new_pad_token = "[PAD]"
pad_token_is_new = new_pad_token not in tokenizer.get_vocab()
if pad_token_is_new:
    tokenizer.add_special_tokens({'pad_token': new_pad_token})
    print(f"Added new pad token: {new_pad_token}")

model = GPT2Model.from_pretrained(_GPT2_CHECKPOINT)
# Keep the embedding matrix in sync with the (possibly enlarged) tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

# With more than one GPU, let DataParallel split each batch across the devices.
if num_gpus > 1:
    model = torch.nn.DataParallel(model)
    print("Model wrapped with DataParallel for multi-GPU usage.")

# Evaluation mode: disables dropout and other training-specific behavior.
model.eval()

# -----------------------------------------------------------------------------------
# Function Definitions
# -----------------------------------------------------------------------------------

def generate_embeddings_batch(texts, tokenizer, model, device, max_length=64):
    """
    Generates one embedding per text by mean-pooling the model's last hidden state.

    Only positions marked as real tokens by the tokenizer's attention mask
    contribute to the mean, so the result does not depend on how much padding
    was added or on the padding token's embedding. The batch dimension is
    always preserved, including for a single-text batch.

    Args:
        texts (list of str): List of text strings (e.g., questions) to generate embeddings for.
        tokenizer (GPT2Tokenizer): Tokenizer corresponding to the GPT-2 model.
        model (GPT2Model): Pre-trained GPT-2 model for generating embeddings.
        device (torch.device): The device (CPU or GPU) to perform computations on.
        max_length (int, optional): Maximum sequence length for tokenization. Defaults to 64.

    Returns:
        numpy.ndarray: Array of embeddings with shape (len(texts), hidden_size).
    """
    # Tokenize the batch; every sequence is padded/truncated to exactly max_length
    inputs = tokenizer(
        texts,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        padding='max_length'
    )

    # Move tokenized inputs to the designated device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():  # Inference only: no gradients needed
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)

        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to a plain mean over all positions.
            pooled = last_hidden_state.mean(dim=1)
        else:
            # Zero out padded positions, then divide by the number of real tokens.
            mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            # clamp avoids a division by zero for a text that tokenizes to nothing
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (last_hidden_state * mask).sum(dim=1) / token_counts

    # No squeeze(): a batch of one must stay 2-D so callers can iterate over rows.
    return pooled.cpu().numpy()

# -----------------------------------------------------------------------------------
# Parameters and Data Loading
# -----------------------------------------------------------------------------------


# Number of questions sent through the model per forward pass
batch_size = 128

# Read the dataset into a Pandas DataFrame; it must provide a 'question' column
df = pd.read_csv("/kaggle/input/tgif-qna-descriptions-38k/updated_final_dataframe_complete.csv")
embeddings = []

# -----------------------------------------------------------------------------------
# Generating Embeddings for the Dataset
# -----------------------------------------------------------------------------------

# Walk the 'question' column in consecutive slices of batch_size rows,
# collecting one embedding vector per row in the original row order
questions = df['question']
batch_starts = range(0, len(df), batch_size)
for start in tqdm(batch_starts, desc="Generating Embeddings"):
    question_batch = questions.iloc[start:start + batch_size].tolist()
    embeddings.extend(
        generate_embeddings_batch(question_batch, tokenizer, model, device)
    )

# -----------------------------------------------------------------------------------
# Saving the Embeddings
# -----------------------------------------------------------------------------------

# Attach the embeddings as a new column and persist the DataFrame as a pickle
df['question_embedding'] = embeddings
df.to_pickle('updated_final_df_with_q_embeddings.pkl')