Cycles through every ordered pair of models and, for each pair, writes a CSV containing `num_convos` generated conversations together with their log-likelihood (LL) difference scores and their similarity scores.

In [None]:
!pip install transformers==4.22.2
import transformers
print(transformers.__version__)
# downgrading is necessary https://stackoverflow.com/questions/74748116/huggingface-automodelforcasuallm-decoder-only-architecture-warning-even-after

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.nn.functional import cross_entropy

import numpy as np
import pandas as pd

import logging

# Suppress the warning
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

Collecting transformers==4.22.2
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.9.0 (from transformers==4.22.2)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1 (from transformers==4.22.2)
  Downloading tokenizers-0.12.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 tokenizers-0.12.1 transformers-4.22.2
4.22.2


In [None]:
# Mount Google Drive; base_dir and the GloVe cache directory below both live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')
import torchtext
import nltk
# Tokenizer data fetched for nltk's word_tokenize, which similarity_score uses.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import torch.nn.functional as F

# Project root on Drive; model directories and the CSV output folder are built by appending to this string.
base_dir = '/content/gdrive/MyDrive/NLP Project/'

# GloVe '6B' vectors, cached on Drive (presumably so they are not re-downloaded every session -- confirm).
cache_dir: str = base_dir + 'GloveCache'
glove = torchtext.vocab.GloVe('6B', cache=cache_dir)

Mounted at /content/gdrive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def similarity_score(sentence1, sentence2):
    '''
    Cosine similarity between the averaged GloVe embeddings of two sentences.

    Each sentence is lower-cased and word-tokenized; tokens missing from the
    GloVe vocabulary are skipped. The remaining word vectors are averaged into
    one sentence vector, and the cosine similarity of the two sentence vectors
    is returned as a Python float.

    Raises whatever torch.stack raises when a sentence has no in-vocabulary
    tokens (an empty list cannot be stacked).
    '''
    # Tokenize both sentences before any embedding lookup happens
    token_lists = [word_tokenize(text.lower()) for text in (sentence1, sentence2)]

    # Keep only the tokens GloVe knows about and fetch their vectors
    vector_lists = [
        [glove[word] for word in words if word in glove.stoi]
        for words in token_lists
    ]

    # Average the word vectors of each sentence into a single vector
    sentence_vectors = []
    for vectors in vector_lists:
        sentence_vectors.append(torch.mean(torch.stack(vectors), dim=0))
    first_vector, second_vector = sentence_vectors

    # cosine_similarity compares along dim 1, so give each vector a leading batch axis
    score = F.cosine_similarity(first_vector.unsqueeze(0), second_vector.unsqueeze(0))
    return score.item()

In [None]:
def average_similarity_score(sentence_list):
    '''
    Return the mean similarity_score over every pair of adjacent sentences.

    sentence_list: ordered list of sentences; at least two are required so
        that there is at least one adjacent pair to score.

    Returns the average of similarity_score(sentence_list[i], sentence_list[i + 1]).

    Raises ValueError when fewer than two sentences are given (there is nothing
    to average). Any error raised by similarity_score propagates unchanged.
    '''
    if len(sentence_list) < 2:
        # Without this guard the average below would divide by zero.
        raise ValueError(
            "average_similarity_score needs at least two sentences, "
            f"got {len(sentence_list)}"
        )

    # Score each sentence against the one that follows it
    similarity_scores = [
        similarity_score(current, following)
        for current, following in zip(sentence_list, sentence_list[1:])
    ]

    return sum(similarity_scores) / len(similarity_scores)

In [None]:
import string

def keep_sentences_until_punctuation(sentences):
    '''
    Return the leading sentences up to (not including) the first "bad" one.

    A sentence is bad when it is empty, consists only of punctuation and
    whitespace, or starts with a punctuation character. Everything from the
    first bad sentence onwards is dropped, even if later sentences are fine.
    '''
    def _ends_conversation(text):
        # True for empty strings too: there is no "real" character in them
        has_real_character = any(
            not (ch in string.punctuation or ch.isspace()) for ch in text
        )
        if not has_real_character:
            return True
        return text[0] in string.punctuation

    kept = []
    for text in sentences:
        if _ends_conversation(text):
            return kept
        kept.append(text)
    return kept

In [None]:
def _load_dialog_model(model_name):
    '''
    Load the (tokenizer, model) pair for one conversation participant.

    "microsoft/DialoGPT-small" is loaded by that identifier; any other name is
    treated as a directory under base_dir holding a custom model.
    '''
    if model_name == "microsoft/DialoGPT-small":
        # default model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
    else:
        # custom model
        model_path = base_dir + model_name
        # The keyword is padding_side; the earlier spelling "padding_size" did not
        # set the tokenizer's padding side.
        tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(model_path)
    return tokenizer, model


def generate_conversation(model1_name, model2_name, print_output=True, default_input=None):
    '''
    Let two dialogue models talk to each other and return the conversation.

    model1_name, model2_name: "microsoft/DialoGPT-small" or the name of a
        custom model directory under base_dir. Model 1 speaks on even steps
        (including the first reply), model 2 on odd steps.
    print_output: when True, print every generated reply as it is produced.
    default_input: opening prompt; when None the prompt is read with input().

    Returns the conversation as a list of strings: the prompt followed by up
    to 10 replies (5 calls and responses), cut off by
    keep_sentences_until_punctuation at the first reply it rejects.
    '''
    # (model, tokenizer, path used for the printed label), in speaking order
    speakers = []
    for name in (model1_name, model2_name):
        tokenizer, model = _load_dialog_model(name)
        speakers.append((model, tokenizer, base_dir + name))

    num_steps = 10  # 5 calls and responses
    # Cap on prompt + generated tokens passed to generate() as max_length.
    max_history_tokens = 1000

    if default_input is None:
        # No prompt supplied: ask the user for one to get the conversation going
        opening = input(">> Please input a prompt to get the conversation going: ")
    else:
        opening = default_input
    sentence_list = [opening]

    # Model 1 answers first, so the prompt is encoded with model 1's tokenizer.
    opening_tokenizer = speakers[0][1]
    chat_history_ids = opening_tokenizer.encode(opening + opening_tokenizer.eos_token, return_tensors='pt')

    for step in range(num_steps):
        # Alternate speakers, always pairing a model with its OWN tokenizer.
        # NOTE(review): the token-id history is handed from one model to the other
        # unchanged, which assumes both models share a vocabulary -- confirm.
        model, tokenizer, model_path = speakers[step % 2]

        # The whole history so far is the input for the next speaker
        bot_input_ids = chat_history_ids

        # generate a response while limiting the total chat history to 1000 tokens
        chat_history_ids = model.generate(
            bot_input_ids,
            max_length=max_history_tokens,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            do_sample=True,
            top_k=100,
            top_p=0.7,
            temperature=0.7)

        # Decode only the newly generated tokens (everything after the input)
        response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
        sentence_list.append(response)

        # pretty print last output tokens
        if print_output:
            print(f"{model_path.split('/')[-1]} (Model #{step % 2 + 1}): {response}")

    return keep_sentences_until_punctuation(sentence_list)

# Model Argument Notes

no_repeat_ngram_size: This argument prevents n-grams from being repeated in the output. It specifies the n-gram size: any sequence of that many consecutive tokens may appear at most once. For example, if set to 3, the model will never generate the same three-token sequence twice.

do_sample: When set to True, this argument enables random sampling during the generation process. It allows the model to explore different possible outputs instead of always selecting the most likely token.

top_k: This argument limits the number of highest probability tokens to consider during random sampling. It defines the size of the "top-k" probability distribution from which tokens are sampled. A higher value allows more diversity in the generated output.

top_p: Also known as "nucleus" or "top-p" sampling, this argument sets a cumulative probability threshold for the random sampling process. It restricts the sampling to the smallest possible set of tokens whose cumulative probability exceeds the threshold.

temperature: This argument controls the randomness of the sampling process. A higher temperature value, such as 1.0, makes the output more random, while a lower value, such as 0.8, makes it more focused and deterministic.

In [None]:
def create_adjacent_pairs(sentences_list):
    '''
    Pair every sentence with the one that follows it.

    Returns a list of (sentence, next_sentence) tuples in order; the list is
    empty when fewer than two sentences are given.
    '''
    # Zipping the list with itself shifted by one yields each adjacent pair
    return list(zip(sentences_list, sentences_list[1:]))

In [None]:
# Scoring model and tokenizer, keyed by model name, so the weights are loaded
# once per session instead of once per sentence pair.
_GPT2_SCORER_CACHE = {}


def _get_gpt2_scorer(model_name):
    '''Return the cached (model, tokenizer) pair for model_name, loading it on first use.'''
    if model_name not in _GPT2_SCORER_CACHE:
        model = GPT2LMHeadModel.from_pretrained(model_name)
        model.eval()
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        _GPT2_SCORER_CACHE[model_name] = (model, tokenizer)
    return _GPT2_SCORER_CACHE[model_name]


def _text_log_likelihood(model, tokenizer, text):
    '''
    Sum of the log-probabilities the model assigns to every token of text
    after the first, each predicted from the tokens before it.
    '''
    input_ids = tokenizer.encode(text, return_tensors='pt')

    with torch.no_grad():
        # Drop the last position: there is no following token for it to predict
        logits = model(input_ids).logits[:, :-1, :]

    # Position i of the logits predicts token i + 1
    next_token_ids = input_ids[:, 1:]
    summed_loss = cross_entropy(
        logits.contiguous().view(-1, logits.size(2)),
        next_token_ids.contiguous().view(-1),
        reduction='sum')
    return -summed_loss.item()


def compute_log_likelihoods(input_sentence, target_sentence):
    '''
    Return log-likelihood(target_sentence) minus
    log-likelihood(input_sentence + ' ' + target_sentence) under GPT-2 medium.

    The first term is the marginal log-likelihood of the target on its own.
    The second term is the one the notebook calls "conditional".
    NOTE(review): that second term is summed over the whole concatenated text,
    so it includes the input_sentence tokens as well as the target tokens; it
    is not restricted to the target given the input. Kept as-is so existing
    scores stay comparable -- confirm this is the intended metric.

    Interpretation used by the notebook: the result is almost always a
    positive number, and the smaller it is, the better the coherence between
    the two sentences.
    '''
    model_name = "gpt2-medium"  # Adjust based on the size of GPT-2 you want
    model, tokenizer = _get_gpt2_scorer(model_name)

    # "Conditional" term: input and target scored together as one text
    conditional_log_likelihood = _text_log_likelihood(
        model, tokenizer, input_sentence + ' ' + target_sentence)

    # Marginal term: the target scored on its own
    marginal_log_likelihood = _text_log_likelihood(model, tokenizer, target_sentence)

    return marginal_log_likelihood - conditional_log_likelihood

In [None]:
def log_likelihood_difference(sentences_list):
    '''
    Score every adjacent sentence pair with compute_log_likelihoods.

    Returns a list with one score per (sentence, next sentence) pair, in
    order; the list is empty when fewer than two sentences are given.
    '''
    return [
        compute_log_likelihoods(current, following)
        for current, following in zip(sentences_list, sentences_list[1:])
    ]

In [None]:
def _score_or_na(score_fn, *args):
    '''Return score_fn(*args), or the string 'N/A' when scoring raises.'''
    try:
        return score_fn(*args)
    except Exception:
        # Best effort: scoring might fail on some models' output (e.g. RandomGPT)
        return 'N/A'


def _csv_safe_name(model_name):
    '''Turn a model name into the fragment used in output file names.'''
    if model_name == "microsoft/DialoGPT-small":
        model_name = "DialoGPT-small"
    return model_name.replace('-', '')


def _save_csv_to_drive(df, file_name):
    '''Remount Google Drive, then write df as CSVs/<file_name> under base_dir.'''
    # Imported here so the Colab-only dependency is needed only when saving.
    import os
    from google.colab import drive

    # Drive is force-remounted before every write (presumably to refresh a mount
    # that went stale during a long run -- confirm).
    drive.mount("/content/gdrive", force_remount=True)
    output_dir = base_dir + "CSVs/"
    # Create the folder if needed so a long run is not lost to a missing directory
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_dir + file_name, index=False)


def cycle_through_all(models, num_convos):
    '''
    Generate and score conversations for every ordered pair of models.

    models: list of model names accepted by generate_conversation.
    num_convos: number of conversations to generate per pair (at least 1).

    For each (model1, model2) pair two CSV files are written under
    base_dir + "CSVs/":
      * <model1>with<model2>.csv -- one row per conversation: its sentences,
        its mean LL difference score and its average similarity score.
      * <model1>with<model2>EXAMPLE.csv -- the first conversation broken into
        adjacent sentence pairs, with the LL difference and similarity score
        of each pair.
    A similarity score that cannot be computed is written as 'N/A' for that
    row only. Returns None.

    Raises ValueError when num_convos is smaller than 1.
    '''
    if num_convos < 1:
        raise ValueError(f"num_convos must be at least 1, got {num_convos}")

    for model1 in models:
        for model2 in models:
            print(f"now processing {model1} with {model2}")

            # generate convos
            convos = [generate_conversation(model1,
                                            model2,
                                            print_output=False,
                                            default_input="What is the meaning of life, the universe, and everything?") for _ in range(num_convos)]

            # get similarity scores; a failure only affects the conversation it occurred in
            convos_similarity_scores = [_score_or_na(average_similarity_score, convo) for convo in convos]

            # per-pair LL differences for every conversation, then their mean per conversation
            pair_LL_scores = [log_likelihood_difference(convo) for convo in convos]
            # a conversation with a single sentence has no pairs, so its mean is NaN
            average_LL_scores = [np.mean(scores) if scores else float('nan') for scores in pair_LL_scores]

            # create output df: one column per sentence position, padded with None
            max_sentences = max(len(convo) for convo in convos)
            data_dict = {f"Sentence_{i+1}": [convo[i] if i < len(convo) else None for convo in convos] for i in range(max_sentences)}
            df = pd.DataFrame(data_dict)
            df['LL Difference Scores'] = average_LL_scores
            df['Average Similarity Scores'] = convos_similarity_scores

            # save df to drive
            file_stem = f"{_csv_safe_name(model1)}with{_csv_safe_name(model2)}"
            _save_csv_to_drive(df, file_stem + ".csv")

            # As an example for the paper, give the LL difference and similarity
            # score between each sentence for a single conversation
            sentence_pairs = create_adjacent_pairs(convos[0])
            first_example_df = pd.DataFrame(sentence_pairs, columns=['Input Sentence', 'Next Sentence'])
            # the per-pair LL scores of the first conversation were already computed above
            first_example_df['LL Difference Scores'] = pair_LL_scores[0]
            first_example_df['Average Similarity Scores'] = [
                _score_or_na(similarity_score, first, second) for first, second in sentence_pairs
            ]

            # save first example df to drive
            _save_csv_to_drive(first_example_df, file_stem + "EXAMPLE.csv")

In [None]:
# models = ['StarWarsGPT-small', 'ShakespeareGPT-small','OfficeGPT-large', 'microsoft/DialoGPT-small']
# num_convos = 30

# cycle_through_all(models, num_convos)

now processing StarWarsGPT-small with StarWarsGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing StarWarsGPT-small with ShakespeareGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing StarWarsGPT-small with OfficeGPT-large
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing StarWarsGPT-small with microsoft/DialoGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing ShakespeareGPT-small with StarWarsGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing ShakespeareGPT-small with ShakespeareGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing ShakespeareGPT-small with OfficeGPT-large
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing ShakespeareGPT-small with microsoft/DialoGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-large with StarWarsGPT-small
Mounted at /content/gdrive
Mounted at 

In [None]:
# Size suffixes of the OfficeGPT model names (presumably the share of the source
# text each variant was fine-tuned on -- confirm):
# large: full (~60000 lines)
# medium: 1/3 (~20000 lines)
# small: 1/10 (~6000 lines)
# extra-small 1/100 (~600 lines)
# Generate and score 30 conversations for every ordered pair of the four variants.
cycle_through_all(['OfficeGPT-large',
                   'OfficeGPT-medium',
                   'OfficeGPT-small',
                   'OfficeGPT-extra-small'], 30)

now processing OfficeGPT-large with OfficeGPT-large


Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-large with OfficeGPT-medium
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-large with OfficeGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-large with OfficeGPT-extra-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-medium with OfficeGPT-large
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-medium with OfficeGPT-medium
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-medium with OfficeGPT-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-medium with OfficeGPT-extra-small
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-small with OfficeGPT-large
Mounted at /content/gdrive
Mounted at /content/gdrive
now processing OfficeGPT-small with OfficeGPT-medium
Mounted at /content/gdrive
