## Data Setup

In [2]:
# Prompt pieces handed to the text generator together with each batch of
# instructions. An empty system message (system_message = "") was the
# alternative setting tried for this experiment.
system_message = "You are an expert in generating realistic 2-party dialogs."
task_que = "(conversation)"

### Fetch Tweets

In [3]:
import pandas as pd

# Load the tweet sample with pandas' default comma delimiter.
# (For a tab-separated export, pass delimiter='\t' to read_csv.)
data_path = "../datasets/sample-tweets.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

### Build user profiles

In [4]:
# Number of tweets sent to the generator per call; the same value is used as
# the number of user profiles sampled for pairing with the tweets of a batch.
batch_size = 128

In [6]:
import json
import random
import pathlib

# Name of the experiment; it selects the folder under
# ../datasets/inst-templates/ that every later path is built from.
expt_name = "expt3"

# Parse the experiment's JSON setup file (json.loads accepts the raw bytes).
setup_file = pathlib.Path(f'../datasets/inst-templates/{expt_name}/expt-setup.json')
expt_setup = json.loads(setup_file.read_bytes())

# Entries under 'user_profiles'; each one is later read for its 'path' and
# its list of 'dispositions'.
raw_user_profiles = expt_setup['user_profiles']

In [5]:
# Every motivation is combined with each of its dispositions. A profile is the
# tuple (motivation text, disposition text) -- motivation FIRST -- holding the
# contents of the two files named by the 'path' entries.
profile_pairs = [
    (
        pathlib.Path(motivation['path']).read_text(),
        pathlib.Path(disposition['path']).read_text(),
    )
    for motivation in raw_user_profiles
    for disposition in motivation['dispositions']
]

# Draw batch_size profiles with replacement so there is one per row of a batch.
# NOTE(review): no random seed is set in the visible cells, so the sample
# differs from run to run.
user_profiles = random.choices(profile_pairs, k=batch_size)

### Fetch instruction template

In [6]:
# Instruction template for conversation generation; it is filled in later with
# str.format using three positional values per tweet.
inst_path = pathlib.Path(
    f'../datasets/inst-templates/{expt_name}/conv-gen.txt'
)
inst = inst_path.read_text()

## Batch Generate Conversations

In [7]:
# Folder that receives one JSON file per generated conversation.
output_dir = pathlib.Path(f"../datasets/inst-templates/{expt_name}/conversations")
# Create the folder up front: Path.write_text does not create missing parent
# directories, so on a fresh checkout the first write would otherwise fail
# with FileNotFoundError after the (expensive) generation step has already run.
output_dir.mkdir(parents=True, exist_ok=True)

In [8]:
from utils import BatchTextGenerator

# Build the project's batch text generator from the 'llm' entry of the
# experiment setup.
# NOTE(review): load_in_8bit=False presumably disables 8-bit quantised model
# loading -- confirm against utils.BatchTextGenerator.
text_gen = BatchTextGenerator(expt_setup['llm'], load_in_8bit=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
import json
from tqdm import tqdm


# Group the tweets by integer division of their index by batch_size, so each
# group can be sent to the generator in a single call.
df_with_progbar = tqdm(df.groupby(df.index//batch_size), desc="Processing batch")

for batch_idx, batch_df in df_with_progbar:
    idxes = list(batch_df.index)
    tweet_ids = list(batch_df['tweet_id'])
    # itertuples() yields (index, first column, second column); this unpacking
    # requires the frame to have exactly two columns, with the tweet text last.
    tweets = [tweet for (_, _, tweet) in batch_df.itertuples()]

    # Row i of every batch is paired with user_profiles[i]. zip stops at the
    # shorter input, so a short final batch only uses the leading profiles.
    profiled_tweets = list(zip(user_profiles, tweets))

    # Profiles are built as (motivation, disposition) tuples. The previous code
    # unpacked them in the reverse order, which stored each text under the
    # wrong key in the output JSON. The positional order handed to the
    # template (motivation, disposition, tweet) is unchanged, so the generated
    # prompts are identical to before.
    # NOTE(review): confirm that the template's placeholders expect the
    # motivation first and the disposition second.
    instructions = [
        inst.format(motv, disp, tweet)
        for (motv, disp), tweet in profiled_tweets
    ]

    batch_op = text_gen(system_message, task_que, instructions)

    # Build output jsons: one file per tweet, named after its DataFrame index.
    for idx, tweet_id, conv, ((motv, disp), tweet) in zip(
        idxes, tweet_ids, batch_op, profiled_tweets
    ):
        conv_json = {
            'tweet_id': tweet_id,
            'disposition': disp,
            'motivation': motv,
            'tweet': tweet,
            'conv': conv
        }

        conv_file = output_dir.joinpath(f'{idx}.json')
        conv_file.write_text(json.dumps(conv_json))


Processing batch: 100%|██████████| 1/1 [01:12<00:00, 72.18s/it]
