In [None]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
from tqdm import tqdm  # For progress bar

# Report how many GPUs are visible and pick CUDA when it is usable, CPU otherwise
num_gpus = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Number of GPUs available: {num_gpus}")

# Tokenizer for the gpt2-medium checkpoint; register "[PAD]" as the pad token
# unless that string is already a vocabulary entry
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-medium")
new_pad_token = "[PAD]"
pad_already_known = new_pad_token in tokenizer.get_vocab()
if not pad_already_known:
    tokenizer.add_special_tokens({'pad_token': new_pad_token})
    print(f"Added new pad token: {new_pad_token}")

# Model weights from the same checkpoint; the embedding matrix is resized to
# the tokenizer's current length so any newly added token id has a row
model = GPT2Model.from_pretrained("openai-community/gpt2-medium")
model.resize_token_embeddings(len(tokenizer))

# Place the model on the selected device
model.to(device)

# With more than one GPU, wrap the model in DataParallel
if num_gpus > 1:
    model = torch.nn.DataParallel(model)
    print("Model wrapped with DataParallel for multi-GPU usage.")

# Switch to evaluation mode
model.eval()

# Function to generate one mean-pooled embedding per text in a batch
def generate_embeddings_batch(texts, tokenizer, model, device, max_length=64):
    """
    Generate a fixed-size embedding for each text by mean-pooling the model's
    last hidden state.

    Texts are truncated/padded to ``max_length`` tokens. When the tokenizer
    returns an ``attention_mask``, only positions where the mask is non-zero
    contribute to the mean, so padding does not dilute the embedding of short
    texts. Without a mask, every position is averaged.

    Args:
        texts (list of str): Texts to embed.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        model: Model whose output exposes ``last_hidden_state``.
        device (torch.device): Device the inputs are moved to.
        max_length (int, optional): Sequence length used for truncation and
            padding. Defaults to 64.

    Returns:
        numpy.ndarray: Array of shape (len(texts), hidden_size). The result is
        always 2-D, including for a single text, so callers can iterate over
        it row by row.
    """
    # Tokenize the batch of texts
    inputs = tokenizer(
        texts,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        padding='max_length'
    )
    # Move inputs to the device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Shape: (batch_size, seq_length, hidden_size)
        last_hidden_state = outputs.last_hidden_state

        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to a plain mean over all positions
            pooled = last_hidden_state.mean(dim=1)
        else:
            # Zero out masked positions and divide by the number of kept
            # tokens per row; clamp guards against a row with no tokens
            mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (last_hidden_state * mask).sum(dim=1) / token_counts

    # No squeeze(): it would drop the batch axis for a batch of one
    return pooled.cpu().numpy()

# Parameters for batching
batch_size = 128  # Adjust based on your GPU memory

# Load your DataFrame
df = pd.read_csv("/kaggle/input/tgif-qna-descriptions-38k/updated_final_dataframe_complete.csv")

print(f"DataFrame shape: {df.shape}")

# One embedding per DataFrame row, collected in row order
embeddings = []

# Iterate over the DataFrame in batches, showing a progress bar
for i in tqdm(range(0, len(df), batch_size), desc="Generating Embeddings"):
    batch_df = df.iloc[i:i+batch_size]
    # Only the 'question' column is embedded
    batch_texts = batch_df['question']
    # Generate embeddings for the batch
    batch_embeddings = generate_embeddings_batch(batch_texts.tolist(), tokenizer, model, device)
    # extend() iterates over the first axis of the returned array
    embeddings.extend(batch_embeddings)

# Add the embeddings to the DataFrame
df['question_embedding'] = embeddings

# Save the DataFrame with embeddings
df.to_pickle('updated_final_df_with_q_embeddings.pkl')

print(df.head())


# **** THE GIVEN CODE SHOWS THE PROCESS OF GENERATING QUESTION EMBEDDINGS

In [None]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
from tqdm import tqdm  # For displaying progress bars during iteration

# -----------------------------------------------------------------------------------
# Configuration and Setup
# -----------------------------------------------------------------------------------

# Count the visible GPUs and report the figure
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")

# Compute device: CUDA when it is usable, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# -----------------------------------------------------------------------------------
# Loading the Pre-trained GPT-2 Model and Tokenizer
# -----------------------------------------------------------------------------------

# Tokenizer for the gpt2-medium checkpoint
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-medium")

# Register "[PAD]" as the pad token unless that string is already in the vocabulary
new_pad_token = "[PAD]"
pad_already_known = new_pad_token in tokenizer.get_vocab()
if not pad_already_known:
    tokenizer.add_special_tokens({'pad_token': new_pad_token})
    print(f"Added new pad token: {new_pad_token}")

# Model weights from the same checkpoint; the embedding matrix is resized to the
# tokenizer's current length so any newly added token id has a row
model = GPT2Model.from_pretrained("openai-community/gpt2-medium")
model.resize_token_embeddings(len(tokenizer))

# Place the model on the selected device
model.to(device)

# With more than one GPU, wrap the model in DataParallel
if num_gpus > 1:
    model = torch.nn.DataParallel(model)
    print("Model wrapped with DataParallel for multi-GPU usage.")

# Switch to evaluation mode
model.eval()

# -----------------------------------------------------------------------------------
# Function Definitions
# -----------------------------------------------------------------------------------

def generate_embeddings_batch(texts, tokenizer, model, device, max_length=64):
    """
    Generates one fixed-size embedding per text by mean-pooling the model's
    last hidden state.

    Texts are truncated/padded to ``max_length`` tokens. When the tokenizer
    returns an ``attention_mask``, only positions where the mask is non-zero
    contribute to the mean, so padding does not dilute the embedding of short
    texts. Without a mask, every position is averaged.

    Args:
        texts (list of str): List of text strings (e.g., questions) to generate embeddings for.
        tokenizer (GPT2Tokenizer): Tokenizer corresponding to the GPT-2 model.
        model (GPT2Model): Pre-trained GPT-2 model for generating embeddings.
        device (torch.device): The device (CPU or GPU) to perform computations on.
        max_length (int, optional): Sequence length used for truncation and padding. Defaults to 64.

    Returns:
        numpy.ndarray: Array of embeddings with shape (batch_size, hidden_size).
        The result is always 2-D, including for a single text, so callers can
        iterate over it row by row.
    """
    # Tokenize the batch of texts with padding and truncation
    inputs = tokenizer(
        texts,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        padding='max_length'
    )

    # Move tokenized inputs to the designated device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():  # Disable gradient calculations for efficiency
        outputs = model(**inputs)
        # Extract the last hidden state from the model's output
        last_hidden_state = outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)

        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to a plain mean over all positions
            pooled = last_hidden_state.mean(dim=1)
        else:
            # Zero out masked positions and divide by the number of kept
            # tokens per row; clamp guards against a row with no tokens
            mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (last_hidden_state * mask).sum(dim=1) / token_counts

    # No squeeze(): it would drop the batch axis for a batch of one
    return pooled.cpu().numpy()

# -----------------------------------------------------------------------------------
# Parameters and Data Loading
# -----------------------------------------------------------------------------------

# Number of rows sent through the model per call; lower it if GPU memory runs out
batch_size = 128

# Read the dataset CSV into a DataFrame and report its dimensions
df = pd.read_csv("/kaggle/input/tgif-qna-descriptions-38k/updated_final_dataframe_complete.csv")
print(f"DataFrame shape: {df.shape}")

# Accumulates the per-row embeddings in DataFrame order
embeddings = []

# -----------------------------------------------------------------------------------
# Generating Embeddings for the Dataset
# -----------------------------------------------------------------------------------

# Walk the DataFrame in consecutive slices of batch_size rows, with a progress bar
for start in tqdm(range(0, len(df), batch_size), desc="Generating Embeddings"):
    # Questions belonging to the current slice, as a plain Python list
    questions = df['question'].iloc[start:start + batch_size].tolist()

    # Embed the slice and add its rows to the running list
    embeddings.extend(generate_embeddings_batch(questions, tokenizer, model, device))

# -----------------------------------------------------------------------------------
# Saving the Embeddings
# -----------------------------------------------------------------------------------

# Store the embeddings in a new column
df['question_embedding'] = embeddings

# Persist the augmented DataFrame as a pickle file
df.to_pickle('updated_final_df_with_q_embeddings.pkl')

# Print the first rows as a quick check
print(df.head())
