## Data Setup

In [None]:
# Prompt pieces passed to the text generator alongside each batch of instructions.
# The system prompt is intentionally empty; an alternative that was tried before:
# system_message = "You are an expert dialog writer. You write compelling conversations with characters that have rich, well developed personalities."
system_message = ""

# Second argument given to the generator call; presumably a cue that marks the
# start of the model's answer -- confirm against utils.BatchTextGenerator.
task_que = '(conversation)'

### Fetch Tweets

In [None]:
import pandas as pd

# Load the sample tweets into a DataFrame using pandas' default delimiter
# (a tab-delimited read, delimiter='\t', was tried previously and is disabled).
data_path = "../datasets/sample-tweets.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

### Fetch User Disposition

In [None]:
# Number of tweets grouped into one generation batch; the same value is used
# as the number of disposition / motivation templates sampled.
batch_size: int = 128

In [None]:
import random
import pathlib

expt_name = "expt2_2"


def _sample_templates(template_dir, k):
    """Return the text of ``k`` ``*.txt`` files drawn at random from ``template_dir``.

    Files are drawn with replacement (``random.choices``), so one template may
    appear several times in the result. Candidates are sorted before sampling
    so the population order does not depend on directory listing order.

    Args:
        template_dir: ``pathlib.Path`` of the directory holding the templates.
        k: number of templates to draw.

    Returns:
        A list of ``k`` strings, each the full content of one sampled file.

    Raises:
        FileNotFoundError: if ``template_dir`` holds no ``*.txt`` file. Without
            this check ``random.choices`` fails with an unhelpful ``IndexError``.
    """
    candidates = sorted(template_dir.glob('*.txt'))
    if not candidates:
        raise FileNotFoundError(f"No '*.txt' templates found in {template_dir}")
    return [path.read_text() for path in random.choices(candidates, k=k)]


# NOTE(review): no random seed is set in the visible cells, so every kernel run
# draws a different assignment -- seed `random` beforehand if runs must be
# reproducible.

# Choosing the misinformed person's disposition
disposition_dir = pathlib.Path(f'../datasets/inst-templates/{expt_name}/dispositions')
dispositions = _sample_templates(disposition_dir, batch_size)

# Choosing the misinformed person's motivation
motivation_dir = pathlib.Path(f'../datasets/inst-templates/{expt_name}/motivations')
motivations = _sample_templates(motivation_dir, batch_size)

# Defining convo resolution:
CONV_RESOLUTION = ['resolved', 'unresolved']

### Fetch Instruction Template

In [None]:
# Read the conversation-generation prompt template for this experiment.
# It is later filled via str.format with three positional values:
# disposition, motivation and tweet text.
inst_path = pathlib.Path(f'../datasets/inst-templates/{expt_name}/conv-gen.txt')
with inst_path.open() as template_file:
    inst = template_file.read()

## Batch Generate Conversations

In [None]:
# Directory that receives one JSON file per generated conversation.
output_dir: pathlib.Path = pathlib.Path(
    f"../datasets/inst-templates/{expt_name}/conversations"
)

In [None]:
from utils import BatchTextGenerator

# Build the batch generator around the Mistral-7B instruct checkpoint.
# load_in_8bit=False presumably disables 8-bit quantised loading -- confirm
# in utils.BatchTextGenerator.
text_gen = BatchTextGenerator(
    'mistralai/Mistral-7B-Instruct-v0.1',
    load_in_8bit=False,
)

In [None]:
import json
from tqdm import tqdm


# Create the destination up front: otherwise the first write fails with
# FileNotFoundError only after a whole batch has already been generated.
output_dir.mkdir(parents=True, exist_ok=True)

# Rows are grouped by integer-dividing the index by batch_size, so each group
# holds the rows whose index falls in the same batch_size-wide bucket.
df_with_progbar = tqdm(df.groupby(df.index//batch_size), desc="Processing batch")

# NOTE(review): `dispositions` and `motivations` are sampled once and reused
# for every batch, so rows at the same position in different batches share the
# same disposition/motivation pair -- confirm this is intended.
for batch_idx, batch_df in df_with_progbar:
    idxes = list(batch_df.index)
    tweet_ids = list(batch_df['tweet_id'])
    # The tweet text is read positionally from the second column. This yields
    # the same values as unpacking itertuples() rows as (index, col0, col1),
    # but does not break when the frame carries additional columns.
    tweets = list(batch_df.iloc[:, 1])

    # zip() stops at the shortest input, so a batch shorter than batch_size
    # simply uses the leading dispositions/motivations.
    instructions = [
        inst.format(disp, motv, tweet)
        for disp, motv, tweet in zip(dispositions, motivations, tweets)
    ]

    batch_op = text_gen(system_message, task_que, instructions)

    # Build output jsons: one file per row, named after the row's index label.
    for idx, tweet_id, conv, tweet, disp, motv in zip(
        idxes, tweet_ids, batch_op, tweets, dispositions, motivations
    ):
        conv_json = {
            'tweet_id': tweet_id,
            'disposition': disp,
            'motivation': motv,
            'tweet': tweet,
            'conv': conv
        }

        conv_file = output_dir.joinpath(f'{idx}.json')
        conv_file.write_text(json.dumps(conv_json))
