In [2]:
# Import libraries
import pandas as pd
import csv
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Add eos token
def add_eos_for_dialogpt(prompt: str, eos_token: str) -> str:
  """Mark the turn boundaries of a multi-line prompt for DialoGPT.

  The prompt is stripped of surrounding whitespace and split on newline
  characters into utterances. ``eos_token`` is appended to every utterance
  except the final one, and the utterances are joined back with newlines.

  Args:
    prompt: Dialogue text with one utterance per line.
    eos_token: End-of-sequence marker to append to non-final utterances.

  Returns:
    The reformatted prompt; a single-line prompt is returned stripped and
    otherwise unchanged.
  """
  # str.split always yields at least one item, so the unpacking cannot fail.
  *earlier_turns, final_turn = prompt.strip().split('\n')

  marked_turns = [turn + eos_token for turn in earlier_turns]
  marked_turns.append(final_turn)

  return '\n'.join(marked_turns)

# Helper used to re-rank generated responses
def _overlap_score(response: str, reference: str) -> int:
  """Return how many distinct whitespace-separated words both strings share."""
  return len(set(response.split()) & set(reference.split()))


# Define function for generation
def process_test_data(model_name, top_k=50, num_return_sequences=5, max_new_tokens=7):
  """Generate, re-rank and save sampled continuations for every test prompt.

  Reads ``rephrase_test.csv``, loads the causal language model and tokenizer
  stored under ``finetuned_models/<model_name>``, samples continuations for
  each value of the ``prompt`` column with top-k sampling, orders them by word
  overlap with the second-to-last line of the prompt, and writes the frame
  (with added ``generated_response_<rank>`` columns) to
  ``<model_name>_generated_responses.csv``.

  Args:
    model_name: Name of the sub-folder of ``finetuned_models`` to load. The
      value ``'dialogpt'`` additionally switches on EOS-delimited prompt
      formatting via ``add_eos_for_dialogpt``.
    top_k: Number of highest-probability tokens kept at each sampling step.
    num_return_sequences: Number of continuations sampled per prompt; also the
      number of ``generated_response_<rank>`` columns written.
    max_new_tokens: Maximum number of tokens generated after the prompt.

  Returns:
    None. The results are written to disk and a confirmation is printed.
  """
  # Read the test.csv into a pandas DataFrame
  # (swap in `.sample(5)` here for a quick smoke run)
  test_sample = pd.read_csv('rephrase_test.csv', dtype={'prompt': str})

  # Use the GPU when one is available, otherwise fall back to the CPU
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f"Using device: {device}")

  # Load model and tokenizer once; they do not change between prompts
  model_path = f"finetuned_models/{model_name}"
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

  dialogpt_eos = '<|endoftext|>'

  # Iterate over each prompt in the frame
  for idx, row in test_sample.iterrows():
    prompt = row['prompt']

    # Conditionally reformat prompt based on model
    if model_name == 'dialogpt':
      prompt = add_eos_for_dialogpt(prompt, dialogpt_eos)

    # Encode the prompt
    input_ids = tokenizer(prompt, return_tensors='pt', add_special_tokens=True)["input_ids"].to(device)

    # Generate responses with top-k sampling
    output = model.generate(
      input_ids=input_ids,
      max_new_tokens=max_new_tokens,
      num_return_sequences=num_return_sequences,
      pad_token_id=tokenizer.eos_token_id,
      do_sample=True,
      top_k=top_k
    )

    # A causal LM returns the prompt ids followed by the new ids. Decode only
    # the new ids, so the prompt neither has to be stripped out by string
    # replacement later nor inflates every overlap score equally.
    prompt_length = input_ids.shape[1]
    generated_responses = [
      tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=False)
      for sequence in output
    ]

    # Last utterance by C (for overlap calculation): the second-to-last line
    # of the prompt with any EOS marker removed. Prompts with fewer than two
    # lines have no such utterance; all scores are then 0 and the stable sort
    # keeps the sampling order.
    prompt_lines = prompt.replace(dialogpt_eos, '').split('\n')
    last_utterance_by_c = prompt_lines[-2] if len(prompt_lines) >= 2 else ''

    # Re-rank based on overlap with last utterance by C
    ranked_responses = sorted(
      generated_responses,
      key=lambda response: _overlap_score(response, last_utterance_by_c),
      reverse=True
    )

    # Store the ranked responses, flattened onto a single line each
    for rank, response in enumerate(ranked_responses, start=1):
      test_sample.at[idx, f'generated_response_{rank}'] = response.replace('\n', ' ')

  # Save the DataFrame to a CSV file
  output_filename = f"{model_name}_generated_responses.csv"
  test_sample.to_csv(output_filename, index=False)

  print(f"Generated responses saved to {output_filename}")

In [None]:
# Run generation with the model stored in the 'opt' folder
process_test_data(model_name='opt')