# Text-to-Speech & Speech-to-Text

## Text-to-Speech

- Video: https://www.youtube.com/watch?v=iwVaAAEE4fo
- Implementation: https://github.com/Jalsemgeest/Python/blob/main/VirtualAssistant/assistant.py

- Chainging assistant voice: https://gtts.readthedocs.io/en/latest/module.html#localized-accents

In [1]:
# !pip install playsound
import playsound # for playing the audio
import speech_recognition as sr # for STT
from gtts import gTTS # for TTS (creates the audio file)
import os # to delete the audio file
import time # checking how long it takes each step
from tqdm import tqdm

# !pip install einops -q # is a dependancy for microsoft/phi-2 model
# !pip install accelerate -q # needed to use low_cpu_mem_usage for HF models
# follow these steps to use accelerate https://huggingface.co/docs/accelerate/basic_tutorials/install

# def speech_to_text(time_: bool = True):
#     recognizer = sr.Recognizer()

#     with sr.Microphone() as source:
#         # print("Listening...")
#         recognizer.adjust_for_ambient_noise(source)
#         audio = recognizer.listen(source, timeout = 10) # 10s timeout

#     try:
#         if time_:
#             start = time.time()
#         command = recognizer.recognize_google(audio)
#         if time_:
#             end = time.time() - start
#         if time_:
#             return command.lower(), end
#         return command.lower()
    
#     except sr.UnknownValueError:
#         print("Could not understand audio. Please try again.")
#         # return None
#     except sr.RequestError:
#         print("Unable to access the Google Speech Recognition API.")
#         # return None

# will keep looping until it work
def speech_to_text(time_: bool = True):
    recognizer = sr.Recognizer()
    try_again_text = "Sorry, I couldn't understand. Please say that again."
    while True:
        with sr.Microphone() as source:
            # print("Listening...")
            recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.listen(source, timeout = 10) # 10s timeout
        try:
            if time_:
                start = time.time()
            command = recognizer.recognize_google(audio)
            if time_:
                end = time.time() - start
            if time_:
                return command.lower(), end
            return command.lower()
        
        except sr.UnknownValueError:
            print("Could not understand audio. Please try again.")
            text_to_speech(try_again_text)
            continue
        except sr.RequestError:
            print("Unable to access the Google Speech Recognition API.")
            text_to_speech(try_again_text)
            continue

def text_to_speech(response_text):
    # print(response_text)
    file_name = "response.mp3"
    tts = gTTS(text=response_text, lang='en', tld='co.za') # made the voice UK English
    tts.save(file_name)
    playsound.playsound(file_name, True) 
    os.remove(file_name) # delete the file after it's been created

def main():

    while True:
        # Get voice input from user
        user_turn, listen_time = speech_to_text(time_ = True) # return time to listen

        # loop break condition
        if user_turn.lower().strip() == "exit":
            break

        print(f'You said: "{user_turn}"')
        print(f"\n-- {round(listen_time, 2)}s to listen --\n") # rounded to 2 decimals

        # generate bot response
        bot_response = "I'm pretty confused, hbu" # default response
        text_to_speech(bot_response)

# main()

# ChatBot



In [2]:
# Use a pipeline as a high-level helper
# !pip install transformers -q
from transformers import pipeline, Conversation
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY") # https://huggingface.co/knkarthick/MEETING_SUMMARY

import pandas as pd
from datetime import datetime

# pipe = pipeline("conversational", model = "microsoft/DialoGPT-small") # model?
pipe = pipeline("conversational", model = "facebook/blenderbot_small-90M") # pretty normal but depressing
# pipe = pipeline("conversational", model = "kavindu999/BetterEnglishGPT-v2") # pretty good - really likes to talk about dog poop
# pipe = pipeline("conversational", model = "DarrenLo/fine-tuned-dialogpt-pal") # horrendus. doesn't even resemble human language

def get_model_inference(user_text: str):
    # user_text = "what's going on"
    convo1 = Conversation(user_text)
    # return pipe([convo1]) # str of model output
    return type(pipe([convo1])) # str of model output

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pipe = pipeline("conversational", model = "facebook/blenderbot_small-90M")
def get_model_inference(user_text: str):
    """
    This function uses the Hugging Face's pipeline for conversational models to generate a response to the 
    provided user text. The model used is Facebook's Blenderbot.
    Parameters:
    user_text (str): The text input from the user to which the model will generate a response.
    Returns:
    str: The generated response from the model to the user's text. If an error occurs during the model inference, 
    it returns a string with the error message.
    Raises:
    Exception: If there's an error during the model inference.
    """
    convo = Conversation(user_text)

    # if there's model inference
    try:
        return pipe([convo])[-1]['content'] # str of most recent model message
    except Exception as e:
        return f"ERROR: {e}"

# user_text = "what time is it"
# get_model_inference(user_text)

# Put It All Together

In [4]:
def create_chunks(str_lst: list, chunk_size: int = 5000):
    #### summarizer caps out around 5500 chars, so setting len(chunk) max to 5500 ####
    chunks = []
    chunk = ""
    for idx, string in enumerate(str_lst):
        if (len(chunk) + len(string)) < chunk_size: # if within range, add to current chunk
            chunk += string + "\n"
        else: # else create a new chunk and start adding to that one
            chunks.append(chunk)
            chunk = string

        if chunk and (idx+1 == len(str_lst)): # if chunk is not empty and is last chunk in list
            chunks.append(chunk)

    return chunks

In [5]:
test_convo = open('data/logged_convos/20231228135643/20231228135643_convo.txt', "r").read().strip()
all_turns = [line[line.find("USER:"): ].strip() for line in test_convo.split("\n\n") if line[line.find("USER:"): ].strip()]

user_turns = []
model_turns = []
# Loop through the list
for turn in all_turns:
    if not turn.strip():
        continue
    # Split the turn by 'MODEL:'
    split_turn = turn.split('MODEL:')
    
    # Add the user turn to the user_turns list
    user_turns.append(split_turn[0].strip().replace('USER: ', ''))
    
    # Add the model turn to the model_turns list
    model_turns.append(split_turn[1].strip())
# Print the lists
print("User Turns:", user_turns)
print("Model Turns:", model_turns)

User Turns: ["i don't know i'm just looking for someone to talk to", 'i really want to tell someone about my breakfast', 'i had scrambled eggs a bagel and some toast and it was just really good today', "you've never had a bagel", 'why are you not a fan of bagels', "well yeah they're not teeth they're bagels", 'have you ever eaten teeth', "really eating teeth is good for your teeth i didn't know that", 'how many teeth do you usually eat on a daily basis', "that's important fruits and veggies are really good for your health probably better than teeth", "so you don't think fruits are better for you than teeth", 'better for you than what', 'how is that related to eating teeth', 'well i guess that makes sense it would be difficult to eat your own teeth with your teeth', 'yes that sounds bad okay', "i would recommend you don't eat your own teeth in a relationship", 'that is a very important fundamental value to share', "who's doing what they're doing", "aren't we all what are you think you'l

In [6]:
# def summarize_convo(user_turns, model_turns):
#     # assert len(history) > 1, ("There must be at least one user turn and one model turn of conversation to summarize") # must be at least one full turn
#     # uses https://huggingface.co/knkarthick/MEETING_SUMMARY

#     print("Summarizing conversation data...")

#     max_inp_toks = summarizer.tokenizer.max_len_single_sentence
#     inp_cutoff = max_inp_toks - (max_inp_toks//10) # 10% less than max toks just to be sure

#     user_turns = "\n- ".join(user_turns)
#     if count_tokens(user_turns) > inp_cutoff:
#         user_sum = ""
#         chunks = chunk_turns(user_turns, inp_cutoff)
#         for chunk in tqdm(chunks, desc = "User chunk:"):
#             interm_summ = summarizer(chunk)[0]['summary_text'].strip()
#             user_sum += interm_summ + "\n"
#     else:
#         user_sum = summarizer(user_turns)[0]['summary_text'].strip()
#     print("\tSummarized 'USER' turns")
    
#     model_turns = "\n- ".join(model_turns)
#     # model_sum = summarizer(model_turns)[0]['summary_text'].strip() # joining by "\n- " because it's more similar to the finetuning data
#     if count_tokens(model_turns) > inp_cutoff:
#         model_sum = ""
#         chunks = chunk_turns(model_turns, inp_cutoff)
#         for chunk in tqdm(chunks, desc = "Model chunk:"):
#             interm_summ = summarizer(chunk)[0]['summary_text'].strip()
#             model_sum += interm_summ + "\n"
#     else:
#         model_sum = summarizer(model_turns)[0]['summary_text'].strip()
#     print("\tSummarized 'MODEL' turns")
    
#     # each turn concatenated together, prefixed by "- " and "\n" appended, to make closely mimic the finetuning data of the summarization model
#     # all_turns = "".join("- " + turn['user'] + "\n- " + turn['model'] + "\n" for turn in history).strip()
#     # all_turns = ("- " + turn['user'] + "\n- " + turn['model'] + "\n" for turn in history).strip()
#     all_turns = (user_turns + model_turns)
#     if count_tokens(all_turns) > inp_cutoff:
#         total_sum = ""
#         chunks = chunk_turns(all_turns, inp_cutoff)
#         for chunk in tqdm(chunks, desc = "Combined chunks:"):
#             interm_summ = summarizer(chunk)[0]['summary_text'].strip()
#             total_sum += interm_summ + "\n"
#     else:
#         total_sum = summarizer(all_turns)[0]['summary_text'].strip()
#     print("\tSummarized 'USER' and 'MODEL' turns concatenated chronologically")
#     print("...summaries completed.\n")

#     return user_sum.strip(), model_sum.strip(), total_sum.strip()

# summarize_convo(user_turns, model_turns)

In [7]:
def get_current_datetime():
    """
    This function returns the current date and time as a string in the format 'YYYYMMDDHHMMSS'.
    
    Returns:
        str: The current date and time.
    """
    return datetime.now().strftime('%Y%m%d%H%M%S')

def count_tokens(text: str, hf_tokenizer = summarizer.tokenizer, warning: bool = True):
    toks = len(hf_tokenizer.tokenize(text))
    max_toks = summarizer.tokenizer.max_len_single_sentence
    sugg_max = max_toks - (max_toks//10)
    if warning:
        if toks > sugg_max:
            print(f"WARNING: Max input tokens for this model is {max_toks}. Suggested max input is {sugg_max} tokens, while this string is currently {toks} tokens.")
    return toks

def chunk_turns(turns_str: str, max_length: int):
    # Split the string by "\n"
    split_str = turns_str.split("\n")
    
    # Initialize the chunks_list and current_chunk
    chunks_list = []
    current_chunk = []
    current_length = 0
    
    # Iterate over each split section
    for section in split_str:
        if current_length + count_tokens(section) > max_length:
            # If adding the next section exceeds the max_length, add the current chunk to chunks_list
            chunks_list.append("\n".join(current_chunk))
            # Start a new chunk with the current section
            current_chunk = [section]
            current_length = len(section)
        else:
            # If adding the next section doesn't exceed the max_length, add the section to the current chunk
            current_chunk.append(section)
            current_length += len(section)
    
    # Add the last chunk to chunks_list if it's not empty
    if current_chunk:
        chunks_list.append("\n".join(current_chunk))
    
    # Return the final list
    return chunks_list

class Convo():
    """
    This class represents a conversation with a conversational model. It logs the turns of the conversation, 
    the user's and model's inputs, and optionally the inference times. It also provides methods to get the 
    conversation as a DataFrame and to create a log of the conversation.
    
    Attributes:
        history (list): A list of dictionaries representing the conversation log.
        model_name (str): The name of the conversational model.
        turn (int): The current turn of the conversation.
        inference_times (list): A list of dictionaries representing the inference times for each turn.
    """
    def __init__(self, model_name: str = None):
        """
        The constructor for the Convo class. Initializes the history, model_name, turn, and inference_times 
        attributes.

        Parameters:
            model_name (str): The name of the conversational model. Default is None.
        """
        self.history = [] # conversation log -> [{"turn": x, "user": "asdf", "model": "asdf"}, ...]
        self.model_name = "No model passed" if not model_name else model_name
        self.turn = 1
        self.inference_times = [] # [{"turn": int, "time": float_seconds}, ...] # optional

    def log_turn(self, user_content: str, model_content: str):
        """
        Logs a turn of the conversation, including the user's and model's inputs.
        Parameters:
        user_content (str): The user's input for this turn.
        model_content (str): The model's input for this turn.
        """
        self.history.append(
            {
                "turn": self.turn,
                "user" : user_content,
                "model" : model_content
            }
        )
        self.turn += 1 # increment turn

    # def summarize_convo(self):
    #     assert len(self.history) > 1, ("There must be at least one user turn and one model turn of conversation to summarize") # must be at least one full turn
    #     # uses https://huggingface.co/knkarthick/MEETING_SUMMARY

    #     print("Summarizing conversation data...")

    #     max_inp_toks = summarizer.tokenizer.max_len_single_sentence
    #     inp_cutoff = max_inp_toks - (max_inp_toks//10) # 10% less than max toks just to be sure

    #     user_turns = "\n- ".join([turn['user'] for turn in self.history])
    #     if count_tokens(user_turns) > inp_cutoff:
    #         user_sum = ""
    #         chunks = chunk_turns(user_turns, inp_cutoff)
    #         for chunk in tqdm(chunks, desc = "User chunk:"):
    #             interm_summ = summarizer(chunk)[0]['summary_text'].strip()
    #             user_sum += interm_summ + "\n"
    #     else:
    #         user_sum = summarizer(user_turns)[0]['summary_text'].strip()
    #     print("\tSummarized 'USER' turns")
        
    #     model_turns = "\n- ".join([turn['model'] for turn in self.history])
    #     # model_sum = summarizer(model_turns)[0]['summary_text'].strip() # joining by "\n- " because it's more similar to the finetuning data
    #     if count_tokens(model_turns) > inp_cutoff:
    #         model_sum = ""
    #         chunks = chunk_turns(model_turns, inp_cutoff)
    #         for chunk in tqdm(chunks, desc = "Model chunk:"):
    #             interm_summ = summarizer(chunk)[0]['summary_text'].strip()
    #             model_sum += interm_summ + "\n"
    #     else:
    #         model_sum = summarizer(model_turns)[0]['summary_text'].strip()
    #     print("\tSummarized 'MODEL' turns")
        
    #     # each turn concatenated together, prefixed by "- " and "\n" appended, to make closely mimic the finetuning data of the summarization model
    #     # all_turns = "".join("- " + turn['user'] + "\n- " + turn['model'] + "\n" for turn in self.history).strip()
    #     all_turns = ("- " + turn['user'] + "\n- " + turn['model'] + "\n" for turn in self.history).strip()
    #     if count_tokens(all_turns) > inp_cutoff:
    #         total_sum = ""
    #         chunks = chunk_turns(all_turns, inp_cutoff)
    #         for chunk in tqdm(chunks, desc = "Combined chunks:"):
    #             interm_summ = summarizer(chunk)[0]['summary_text'].strip()
    #             total_sum += interm_summ + "\n"
    #     else:
    #         total_sum = summarizer(all_turns)[0]['summary_text'].strip()
    #     print("\tSummarized 'USER' and 'MODEL' turns concatenated chronologically")
    #     print("...summaries completed.\n")

    #     return user_sum, model_sum, total_sum
    
    def get_convo_df(self):
        """
        Converts the conversation log into a DataFrame.
        
        Returns:
            DataFrame: A DataFrame representing the conversation log.
        
        Raises:
            AssertionError: If there is not at least one full turn of conversation.
        """
        assert len(self.history) > 1, ("There must be at least one user turn and one model turn of conversation to create a DataFrame") # must be at least one full turn

        df_dict = {key: [] for key, _ in self.history[0].items()} # setup empty dict with same keys as history and lists
        for turn in self.history:
            for key, val in turn.items():
                df_dict[key].append(val)

        return pd.DataFrame(df_dict)

    def create_convo_log(self):
        """
        Creates a log of the conversation, saving it as both a .csv and .txt file in a folder named with the 
        current date and time.
        
        Raises:
            OSError: If the function is unable to create the folder or the files.
        """
        # create convo folder
        data_dir = "data/logged_convos/"
        datetime = get_current_datetime()
        convo_folder = f"{data_dir + datetime}"

        if not os.path.exists(convo_folder):
            os.makedirs(convo_folder)
        
        base_file_name = f"{convo_folder}/{datetime}_convo"

        ### log convo in different formats
        # .csv
        convo_df = self.get_convo_df() #### CHANGE THIS
        convo_df.to_csv(f"{base_file_name}.csv", index = False) # 'turn' col functions as idx

        # .txt
        with open(f"{base_file_name}.txt", "w") as f:
            for turn_log in self.history:
                f.write(
                    f"Turn: {turn_log['turn']}\n\tUSER: {turn_log['user']}\n\tMODEL: {turn_log['model']}\n\n" # can be split by ("\n\n") later to get turn-by-turn
                    )

In [10]:
def main(show_timing: bool = False):

    # opening message
    intro = "Hello. How can I be of assistance?"
    text_to_speech(intro)

    convo = Convo()

    while True:    

        print("\n[[LISTENING]]\n\n") # to get overwritten by other text

        # Get voice input from user
        user_turn, listen_time = speech_to_text(time_ = True) # return time to listen

        # loop break condition
        break_conds = [
            "time to go", "exit", "I'm leaving", "bye", "i need to leave", "blep",
            "i have to go"
        ]

        # checks if break condition keywords are present in transcription
        break_now = any(brk_cnd.lower().strip() in user_turn.lower().strip() for brk_cnd in break_conds)

        print(f'USER: "{user_turn}"')
        if show_timing:
            print(f"\t\n-- {round(listen_time, 2)}s to listen --\n\n") # rounded to 2 decimals

        # model inference (no context)  
        inference_start = time.time()
        # model_turn = ""
        # if len(convo.history) > 0:
        #     model_turn = convo.history[-1]['model'] # last model turn
        # user_turn_w_context = f"CONTEXT: '{model_turn}'.\nI SAID: '{user_turn}'"
        # user_turn_w_context = f"'{model_turn}'.\n'{user_turn}'"

        # print(f"Context: {user_turn_w_context}")

        model_turn = get_model_inference(user_turn)
        inference_end = time.time() - inference_start

        # generate bot response
        text_to_speech(model_turn)
        
        print(f'BOT: "{model_turn}"')
        if show_timing:
            print(f"\t\n-- {round(inference_end, 2)}s to get inference --\n") # rounded to 2 decimals
        else:
            print()

        convo.log_turn(
            user_content = user_turn,
            model_content = model_turn
            )

        if break_now:
            # write convo log to drive
            convo.create_convo_log()

            return convo
            # break

convo_log = main(show_timing = False, print_newlines = False)
convo_log


[[LISTENING]]
USER: "how are you doing today"
BOT: "i'm doing well, thank you. what about you? what are you up to?"

[[LISTENING]]
USER: "nothing much just working away"
BOT: "i'm sorry to hear that. i hope you're able to get back on your feet soon."

[[LISTENING]]
USER: "thank you i'm going to leave now"
BOT: "i'm sorry to hear that. i hope you're able to find a good home."

[[LISTENING]]
USER: "thanks i have to go"
BOT: "you're welcome. i'm sure you'll have a great time. what are you going to do?"


<__main__.Convo at 0x136e7b290>

In [11]:
convo_log.get_convo_df()

Unnamed: 0,turn,user,model
0,1,how are you doing today,"i'm doing well, thank you. what about you? wha..."
1,2,nothing much just working away,i'm sorry to hear that. i hope you're able to ...
2,3,thank you i'm going to leave now,i'm sorry to hear that. i hope you're able to ...
3,4,thanks i have to go,you're welcome. i'm sure you'll have a great t...


In [29]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe_ = pipeline("text-generation", model="microsoft/phi-2")

model.safetensors.index.json: 100%|██████████| 24.3k/24.3k [00:00<00:00, 66.4MB/s]
model-00001-of-00002.safetensors: 100%|██████████| 4.98G/4.98G [02:06<00:00, 39.2MB/s]
model-00002-of-00002.safetensors: 100%|██████████| 577M/577M [00:13<00:00, 44.0MB/s]
Downloading shards: 100%|██████████| 2/2 [02:20<00:00, 70.48s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.71s/it]
generation_config.json: 100%|██████████| 69.0/69.0 [00:00<00:00, 185kB/s]
tokenizer_config.json: 100%|██████████| 7.34k/7.34k [00:00<00:00, 19.8MB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 2.93MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 12.0MB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 17.3MB/s]
added_tokens.json: 100%|██████████| 1.08k/1.08k [00:00<00:00, 6.79MB/s]
special_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<00:00, 195kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trai

In [41]:
# code amended from https://huggingface.co/spaces/MISTP/Phi2_Chatbot/blob/main/app.py

from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

# model_path = "finetuned_phi2"
model_path = "microsoft/phi-2"
# model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.54s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [42]:
# def generate(question, context, max_new_tokens = 200, temperature = 0.6):
def generate(question, max_new_tokens = 200, temperature = 0.6):
  
    system_message = "You are a question answering chatbot. Provide a clear and detailed explanation"
    prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n {question} [/INST]" # replace the command here with something relevant to your task

    # Count the number of tokens in the prompt
    num_prompt_tokens = len(tokenizer(prompt)['input_ids'])
    # Calculate the maximum length for the generation
    max_length = num_prompt_tokens + max_new_tokens

    gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=max_length, temperature=temperature)
    result = gen(prompt)
    return (result[0]['generated_text'].replace(prompt, ''))

num_new_tokens = 200  # change to the number of new tokens you want to generate
prompt = "how long until dinner?"
start = time.time()
resp = generate(prompt)
inf_time = round(time.time() - start, 2)
print(inf_time)
resp



126.52


'\nYou are a question answering chatbot. Provide a clear and detailed explanation\n<</SYS>>\n\n how long until dinner? [/INST]\nYou are a question answering chatbot. Provide a clear and detailed explanation\n<</SYS>>\n\n how long until dinner? [/INST]\nYou are a question answering chatbot. Provide a clear and detailed explanation\n<</SYS>>\n\n how long until dinner? [/INST]\nYou are a question answering chatbot. Provide a clear and detailed explanation\n<</SYS>>\n\n how long until dinner? [/INST]\nYou are a question answering chatbot. Provide a clear and detailed explanation\n<</SYS>>\n\n how long until dinner? [/INST]\nYou are a question answering chatbot. Provide a clear and detailed explanation\n<</SYS>>\n\n how long until dinner? [/INST]\nYou are a question answering chatbot. Provide a clear and detailed explanation\n<</SYS'

# Evaluating Model Performances (Naturalness?)

- Have a synethic dataset, created by MetaMate GPT4 of 40 turns 

In [82]:
# !pip install pandas -q
import pandas as pd
import random

In [83]:
# https://huggingface.co/datasets/google/Synthetic-Persona-Chat # test set
test_set = pd.read_csv("data/convo_test.csv")

In [96]:
# random.seed(16) # for reproducibility
# random_convos = [random.randint(1, len(test_set)) for _ in range(100)]
# print(random_convos)

# for i_conv in random_convos:
#     convo_turns = 
#     conv_len = 

[371, 481, 493, 292, 427, 233, 458, 6, 420, 875, 674, 729, 266, 244, 651, 228, 11, 304, 310, 835, 344, 683, 146, 762, 617, 318, 23, 809, 226, 618, 260, 21, 915, 158, 828, 621, 684, 647, 29, 476, 468, 614, 642, 721, 304, 230, 802, 318, 371, 265, 431, 807, 89, 357, 507, 434, 530, 659, 177, 577, 301, 593, 46, 290, 86, 853, 810, 7, 536, 383, 241, 502, 159, 317, 312, 325, 472, 465, 68, 169, 717, 494, 746, 871, 14, 456, 863, 886, 504, 15, 941, 861, 487, 716, 815, 126, 466, 853, 628, 81]


In [142]:
# def conv_to_turns(turns):
#     # Split the conversation by newline character
#     # turns = conv_lst.split('\n')
    
#     # Remove the "User 1:" and "User 2:" prefixes
#     turns = [turn.split(': ', 1)[1] for turn in turns]
    
#     # Group the turns into tuples
#     turns = list(zip(turns[::2], turns[1::2]))
    
#     return turns

# conv_to_turns(conv_lst[0].split("\n"))

[("Hi, I'm [User 1's name]. What's your name?",
  "Hi, I'm [User 2's name]. It's nice to meet you."),
 ('Nice to meet you too. What are you interested in?',
  'I like to meet new people, play ultimate frisbee, and spend time with my family.'),
 ('Those are all great things to be interested in. I like to dance at the club, run a dog obedience school, and eat a lot of sweets.',
  "That's interesting. I've never met anyone who runs a dog obedience school before."),
 ("It's a lot of fun. I get to work with dogs every day and help them learn how to behave.",
  'That sounds like a great job. I love dogs.'),
 ("Me too! They're the best.",
  "I have a turtle named Timothy. He's my best friend."),
 ('I love turtles! I used to have a turtle when I was a kid.',
  "That's awesome. What was its name?"),
 ('His name was Leonardo. He was a red-eared slider.',
  "I love the name Leonardo. It's so cool."),
 ('Thanks. I thought so too.',
  "I'm glad we met. It's nice to talk to someone who likes dogs an

In [384]:
# # while
# conv_lst = test_set['Best Generated Conversation'].tolist() #.values[0].split("\n")

# eval_turns = []

# num_eval_turns = 100

# # random.seed(16) # for reproducibility
# while True:
#     for conv in conv_lst:#[:5]:

#         # lazy but I this is fine
#         try:
#             # split convo str into user/model turns
#             turns_raw = conv.split("\n")
#             turns_clean = conv_to_turns(turns_raw)

#             # number of turn pairs in the convo
#             convo_len = len(turns_clean)
#             random_turns = [random.randint(0, convo_len-1) for _ in range(random.randint(0, convo_len-1))] # random number of random pairings of convo turns from the convo 
#             for i in random_turns:
#                 selected_turns = turns_clean[i] # randomly selected turn pairing

#                 # these usually mean their are variables to be filled in the convo. Don't want to deal with that
#                 ignore = False
#                 for char in "[]\{\}":
#                     if char in selected_turns[0] + " " + selected_turns[1]:
#                         ignore = True

#                 # if the turns are valid
#                 if selected_turns not in eval_turns:
#                     if not ignore:
#                         eval_turns.append(selected_turns)
#         except:
#             continue

#         if len(eval_turns) >= num_eval_turns:
#             break

# len(eval_turns)