In [100]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer


In [101]:
# Llama 3.1 8B Instruct, resolved against the local model directory:
# the final path is the directory prefix followed by the model identifier.
local_path = '/nfs/public/hf/models/'
model = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
full_model_path = f'{local_path}{model}'

In [102]:
# Replaces the selection made in the previous cell: the model identifier is
# used directly as the path, with no local directory prefix.
# Alternative identifier that can be swapped in:
# model = 'google/gemma-2-9b-it'
full_model_path = model = 'microsoft/Phi-3.5-mini-instruct'

In [103]:
# Two instruction strings: index 0 asks for a plan only, index 1 asks the model
# to carry out that plan and state the answer.
system_content = [
    'Create a step-by-step plan for finding the answer to the following problem. Do not answer the question. Do not perform the actions in the plan. Your only task is to outline the steps involved in a concise and clear manner.',
    'Now perform the steps in the plan you created. Use the most precise, accurate and up-to-date information available. To save space, be very concise when describing the actions. Conclude by stating the answer that you reached by following the steps you outlined.',
]
# Sampling temperature handed to model.generate().
temperature = 0.2
# Fixed seed so a fresh "Restart & Run All" draws the same question every time
# (same value as the commented StrategyQA alternative below).
sample_seed = 43
dataset = pd.read_json('../resources/data/Franklin/full_study.jsonl', lines=True)
df = dataset[['internal_id', 'question']].sample(1, random_state=sample_seed)
# Alternative dataset (note the different id column):
# dataset = pd.read_json('../resources/data/StrategyQA/dev.jsonl', lines=True)
# df = dataset[['qid', 'question']].sample(1, random_state=43)

In [104]:
# Tokenizer: the EOS token doubles as the padding token, and padding is applied
# on the left side of the sequence.
tokenizer = AutoTokenizer.from_pretrained(full_model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

# Causal LM loaded in bfloat16; device_map='auto' leaves weight placement to
# the library. Note that this rebinds `model` from the identifier string to
# the loaded model object.
model = AutoModelForCausalLM.from_pretrained(
    full_model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='auto',
)

# model.config.use_flash_attention = True

# Device that tokenized inputs are moved to before generation.
device = model.device

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.01it/s]


In [105]:
def create_conversation(question):
    """Build the opening chat history for one question.

    Args:
        question: Text placed in the user message.

    Returns:
        A two-element list of message dicts: a ``system`` message carrying
        ``system_content[0]`` followed by a ``user`` message carrying
        ``question``.
    """
    system_message = {'role': 'system', 'content': system_content[0]}
    user_message = {'role': 'user', 'content': question}
    return [system_message, user_message]


# Only the question column is needed, so map over that Series instead of
# applying row-wise over the whole frame. This always yields one list per row
# and sidesteps DataFrame.apply's result-shape inference for list-like returns.
df['conversations'] = df['question'].map(create_conversation)

In [106]:
# Generate one response per conversation.
#
# Each conversation currently looks like:
# [
#     {'role': 'system', 'content': system_content[0]},
#     {'role': 'user', 'content': question},
# ]
#
# TODO: the intended final shape is a two-turn exchange, which this cell does
# not implement yet:
# [
#     {'role': 'system', 'content': system_content[0]},
#     {'role': 'user', 'content': question},
#     {'role': 'assistant', 'content': '...'},
#     {'role': 'user', 'content': 'Now perform the steps in the plan you created.'},
#     {'role': 'assistant', 'content': '...'},
# ]
#
# For now both instruction strings are joined into a single text and encoded
# together with the question as a (text, text_pair) input; no chat template
# is applied.
responses = []
conversations = df['conversations'].tolist()
instructions = ' '.join(system_content)

for conversation in tqdm(conversations, desc='Generating responses'):
    # The question is the content of the last message in the conversation.
    question_text = conversation[-1]['content']

    encoded = tokenizer(
        instructions,
        question_text,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # Pass the attention mask and pad token id explicitly: the pad token is
    # the EOS token, so generate() cannot reliably infer the mask from the
    # input ids alone.
    outputs = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_new_tokens=512,
        do_sample=True,
        temperature=temperature,
        pad_token_id=tokenizer.pad_token_id,
    )

    # The full sequence is decoded, so each stored response still begins with
    # the prompt text; a later cell strips it.
    responses.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

print(responses)
df['response'] = responses

Generating responses: 100%|██████████| 1/1 [00:29<00:00, 30.00s/it]

["Create a step-by-step plan for finding the answer to the following problem. Do not answer the question. Do not perform the actions in the plan. Your only task is to outline the steps involved in a concise and clear manner. Now perform the steps in the plan you created. Use the most precise, accurate and up-to-date information available. To save space, be very concise when describing the actions. Conclude by stating the answer that you reached by following the steps you outlined.For the country in Sub-Saharan Africa that had the lowest GDP in 2014, what will be its life expectancy in 2028?\n\n\nHere is the plan:\n\n1. **Identify the Sub-Saharan African country with the lowest GDP in 2014.**\n    * Consult a reliable source of economic data, such as the World Bank or the International Monetary Fund (IMF).\n    * Search for GDP data for Sub-Saharan African countries in 2014.\n    * Determine the country with the lowest GDP.\n2. **Find the life expectancy data for that country in 2014.**




Unnamed: 0,internal_id,question,conversations,response
256,C2-94cc25c7,For the country in Sub-Saharan Africa that had...,"[{'role': 'system', 'content': 'Create a step-...",Create a step-by-step plan for finding the ans...


In [86]:
# Print the raw decoded text (prompt included) for the first row of df.
print(df.loc[df.index[0], 'response'])

Create a step-by-step plan for finding the answer to the following problem. Do not answer the question. Do not perform the actions in the plan. Your only task is to outline the steps involved in a concise and clear manner. Now perform the steps in the plan you created. Use the most precise, accurate and up-to-date information available. To save space, be very concise when describing the actions. Conclude by stating the answer that you reached by following the steps you outlined.For the country in Sub-Saharan Africa that had the lowest GDP in 2014, what will be its life expectancy in 2028?


Here is the plan:

1. **Identify the Sub-Saharan African country with the lowest GDP in 2014.**
    * Consult a reliable source of economic data, such as the World Bank or the International Monetary Fund (IMF).
    * Search for GDP data for Sub-Saharan African countries in 2014.
    * Determine the country with the lowest GDP.
2. **Find the life expectancy data for that country in 2014.**
    * Use 

In [97]:
# Recover only the generated text from the first row's decoded output.
#
# The decoded text starts with the instructions followed by the question, but
# the separator between them is not guaranteed to be a single space (the
# recorded output shows "...outlined.For the country..." with no space).
# Slicing by len(prefix) can therefore cut off the first generated character,
# so split on the question text itself instead.
first_row = df.index[0]
question_text = df.loc[first_row, 'question']
decoded_text = df.loc[first_row, 'response']
prefix = f'{" ".join(system_content)} {question_text}'

_, matched, generated_text = decoded_text.partition(question_text)
if not matched:
    # Question not found verbatim (e.g. truncated or altered by the tokenizer
    # round-trip): fall back to the length-based slice as a best effort.
    generated_text = decoded_text[len(prefix):]

response = generated_text.strip(' \n')

In [98]:
# Print the question text of the first row of df.
print(df.loc[df.index[0], 'question'])

For the country in Sub-Saharan Africa that had the lowest GDP in 2014, what will be its life expectancy in 2028?


In [99]:
# Print `response`, the prompt-stripped generated text computed in an earlier cell.
print(response)

Here is the plan:

1. **Identify the Sub-Saharan African country with the lowest GDP in 2014.**
    * Consult a reliable source of economic data, such as the World Bank or the International Monetary Fund (IMF).
    * Search for GDP data for Sub-Saharan African countries in 2014.
    * Determine the country with the lowest GDP.
2. **Find the life expectancy data for that country in 2014.**
    * Use the same data sources as in step 1.
    * Locate the life expectancy data for the identified country in 2014.
3. **Research trends in life expectancy for that country.**
    * Explore historical life expectancy data for the country, if available.
    * Analyze any available reports or studies on factors influencing life expectancy in the country.
4. **Project the life expectancy in 2028.**
    * Based on the trends identified in step 3, make a reasonable projection for the life expectancy in 2028.
    * Consider potential influencing factors, such as economic growth, healthcare improvements,