# tokenizer

In [1]:

from transformers import LlamaTokenizer


# Hub id of the checkpoint whose tokenizer is used to measure prompt lengths
# (see the `tokenizer.encode` call in the dataset-generation cell).
base_model = "huggyllama/llama-13b"
tokenizer = LlamaTokenizer.from_pretrained(base_model)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Load and inspect conversations

In [2]:
import json
# Read the list of topic-conversation records from the JSON file.
conversations_path = 'extened_longchat_topiced_conversations.json'
with open(conversations_path, 'r', encoding='utf-8') as conv_file:
    conv_list = json.load(conv_file)
    


In [3]:
# Show the topic title of every loaded conversation.
[conv['TOPIC'] for conv in conv_list]

['The psychology of happiness',
 'The benefits of mindfulness meditation',
 'The effects of climate change on ocean ecosystems',
 'The future of sustainable agriculture',
 'The history and culture of ancient civilizations',
 'The impact of social media on communication',
 'The role of education in society',
 'The benefits of regular exercise',
 'The impact of technology on human connection',
 'The future of renewable energy technology',
 'The psychology of creativity',
 'The impact of social media on mental health in adults',
 'The benefits of reading for pleasure',
 'The effects of stress on the body and mind',
 'The history and impact of the Renaissance',
 'The role of art in society',
 'The benefits of a plant-based diet',
 'The impact of social media on body image',
 'The future of space tourism',
 'The effects of sleep on overall health',
 'The role of music in society',
 'The benefits of volunteering',
 'The impact of technology on privacy and security',
 'The future of renewable

In [4]:
# Inspect one record to see its schema: each conversation is a dict with
# 'TOPIC_ID', 'TOPIC' and 'CONVERSATION' keys (see the output below).
conv_list[0]

{'TOPIC_ID': 1,
 'TOPIC': 'The psychology of happiness',
 'CONVERSATION': "USER: I would like to discuss the topic of the psychology of happiness. \n ASSISTANT: Sure, I'd be happy to talk about that with you! What specifically would you like to know? \n USER: I'm interested in learning more about the factors that contribute to happiness. What are some things that people can do to increase their overall level of happiness? \n ASSISTANT: That's a great question! Some of the key factors that contribute to happiness include positive social relationships, engaging in meaningful activities, and finding a sense of purpose or fulfillment in life. Additionally, factors like good physical health, financial stability, and a sense of autonomy can also play a role in our overall happiness. \n USER: That's really interesting. What about the role of gratitude and mindfulness in happiness? \n ASSISTANT: Gratitude and mindfulness are both important factors that have been linked to increased levels of h

In [5]:
# Number of conversations available; the generation cell samples topics from
# this pool and only samples with replacement when it is smaller than the
# requested topic count.
len(conv_list)

158

# Generate the topic-retrieval eval dataset

In [6]:
import numpy as np
import random

# Seed NumPy's global RNG so the `np.random.choice` sampling in the generation
# cell is reproducible when the notebook is run top to bottom on a fresh kernel.
# NOTE(review): re-running only the generation cell continues the RNG stream
# and therefore produces a different sample; re-run this cell first.
np.random.seed(42)

In [7]:
class Prompt:
    """A single test prompt built from several topic conversations.

    Conversations are collected with `add_conv` and later concatenated by
    `assemble_prompt` into one long record; the question asked at the end is
    to retrieve the first topic of that record.

    Attributes:
        id: identifier of this test sample, as passed to the constructor.
        conv_list: conversation dicts in the order they were added.
        topic_list: the 'TOPIC' value of each conversation, parallel to
            `conv_list`.
    """

    def __init__(self, id):
        self.id = id
        self.conv_list = []
        self.topic_list = []

    def add_conv(self, conv):
        """Append a conversation dict and record its 'TOPIC' value."""
        self.conv_list.append(conv)
        self.topic_list.append(conv['TOPIC'])

    def assemble_prompt(self):
        """Build the prompt text and the question that follows it.

        Stores the results on the instance as `prompt`, `prompt_postfix` and
        `retrieval_id` (1-based position of the topic to retrieve, always 1).

        Returns:
            A 4-tuple `(prompt, prompt_postfix, target_index, target_topic)`
            where `target_index` is the 0-based index of the target topic and
            `target_topic` is its name.

        Raises:
            IndexError: if no conversation has been added yet.
        """
        self.retrieval_id = 1

        # Instruction header; adjacent literals are joined by the parser.
        header = (
            "Below between '[[[' and ']]]' is a record of the previous conversations "
            f"on {len(self.topic_list)} different topics between the ASSISTANT and "
            "the USER. At the beginning of each topic, the USER will say "
            "'I would like to discuss the topic of <TOPIC>'. Memorize each "
            "<TOPIC>. At the end of the record, I will ask you to retrieve the "
            "first topic. Now the record start. \nRECORD:\n[[["
        )

        # Conversations are appended back to back, without any separator.
        body = "".join(conv['CONVERSATION'] for conv in self.conv_list)
        self.prompt = header + body

        self.prompt_postfix = (
            "]]]\nNow "
            "the record ends. What is the first topic(s) in the record? Only give "
            "me the topic name. Do not summarize yourself.\nAnswer:"
        )

        target_index = self.retrieval_id - 1
        return self.prompt, self.prompt_postfix, target_index, self.topic_list[target_index]
    

In [12]:
from tqdm import tqdm
import os

# Configuration for the generated evaluation files.
output_dir = "../"       # directory the .jsonl files are written to
num_test_samples = 20    # number of prompts generated per topic count

# Each entry is a number of topics per prompt; one output file is written per entry.
ROWS = [130]
for num_topics in ROWS:

    # Sample with replacement only when there are fewer conversations than
    # requested topics; otherwise every topic in a prompt is distinct.
    allow_repeats = len(conv_list) < num_topics

    # Build the prompts: each one is a random selection of conversations.
    prompt_list = []
    for i in tqdm(range(num_test_samples)):
        prompt = Prompt(i)
        # An int population draws the same indices as the equivalent
        # list(range(...)) without materialising the list.
        indices = np.random.choice(len(conv_list), size=num_topics, replace=allow_repeats)

        for idx in indices:
            prompt.add_conv(conv_list[idx])

        prompt_list.append(prompt)

    # Write one JSON object per line.
    prompt_len_list = []
    output_path = os.path.join(output_dir, f"{num_topics}_topics_extended_cnt{num_test_samples}.jsonl")
    with open(output_path, "w", encoding="utf-8") as f:
        # Iterate the list directly so tqdm knows the total and shows real progress.
        for p in tqdm(prompt_list):
            pt, prompt_postfix, target_id, target_topic = p.assemble_prompt()

            # Token count of the record part only; the postfix is not included.
            prompt_len = len(tokenizer.encode(pt))
            prompt_len_list.append(prompt_len)

            curr_output = {"test_id": p.id,
                           "prompt": pt,
                           "prompt_postfix": prompt_postfix,
                           "target_id": target_id,
                           "target_topic": target_topic,
                           "topics": p.topic_list,
                           "prompt_length": prompt_len}
            json.dump(curr_output, f, ensure_ascii=False)
            f.write("\n")

            # tqdm.write prints without breaking up the progress bar.
            tqdm.write(f"wrote prompt_length: {prompt_len}")

    # One division over the integer sum avoids the rounding drift of
    # accumulating prompt_len / n in a float on every iteration.
    avg_len = sum(prompt_len_list) / len(prompt_len_list) if prompt_len_list else 0

    print(f"saved {output_path}")

# Average prompt length (in tokens) of the last generated file.
avg_len

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 6493.74it/s]
1it [00:00,  2.06it/s]

wrote prompt_length: 86843


2it [00:00,  2.15it/s]

wrote prompt_length: 86941


3it [00:01,  2.21it/s]

wrote prompt_length: 87157


4it [00:01,  2.26it/s]

wrote prompt_length: 87181


5it [00:02,  2.31it/s]

wrote prompt_length: 87464


6it [00:02,  2.32it/s]

wrote prompt_length: 87233


7it [00:03,  2.29it/s]

wrote prompt_length: 87847


8it [00:03,  2.26it/s]

wrote prompt_length: 87687


9it [00:03,  2.28it/s]

wrote prompt_length: 87117


10it [00:04,  2.26it/s]

wrote prompt_length: 87501


11it [00:04,  2.24it/s]

wrote prompt_length: 86544


12it [00:05,  2.24it/s]

wrote prompt_length: 87139


13it [00:05,  2.24it/s]

wrote prompt_length: 87159


14it [00:06,  2.23it/s]

wrote prompt_length: 87962


15it [00:06,  2.26it/s]

wrote prompt_length: 86984


16it [00:07,  2.20it/s]

wrote prompt_length: 87240


17it [00:07,  2.20it/s]

wrote prompt_length: 87324


18it [00:08,  2.20it/s]

wrote prompt_length: 86328


19it [00:08,  2.22it/s]

wrote prompt_length: 87841


20it [00:08,  2.24it/s]

wrote prompt_length: 86598
saved ../130_topics_extended_cnt20.jsonl





87204.49999999997

In [14]:
# Sanity check: the file holds one JSON object per line, so the line count
# should equal num_test_samples. The path is hard-coded and matches
# `output_path` only for num_topics=130 and num_test_samples=20.
!wc -l ../130_topics_extended_cnt20.jsonl

      20 ../130_topics_extended_cnt20.jsonl


In [None]:
# Peek at the first record of the generated file.
# NOTE(review): a single record holds a prompt of roughly 87k tokens, so this
# output is very large; consider clearing it before sharing the notebook.
!head -n 1 ../130_topics_extended_cnt20.jsonl