In [20]:
import csv
import json
import os
import sys
import warnings
from typing import List

import json_repair
import pandas as pd

# Silence warnings before the LangChain packages are imported.
warnings.filterwarnings("ignore")

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser


In [21]:
def write_csv(output_file, data, mode='a'):
    """Write one row to a CSV file.

    Args:
        output_file: Path of the CSV file to open.
        data: Iterable of cell values that make up the single row.
        mode: File mode handed to ``open``; defaults to ``'a'`` (append).
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_file, mode, newline='', encoding='utf-8') as csv_handle:
        csv.writer(csv_handle).writerow(data)

In [22]:
def write_txt(txt_path, data):
    """Append a string to a text file.

    Args:
        txt_path: Path of the text file; it is created if it does not exist.
        data: Text to append. No newline is added.
    """
    # Explicit UTF-8 (matching write_csv) so non-ASCII text is written the
    # same way regardless of the platform's default encoding.
    with open(txt_path, 'a', encoding='utf-8') as txf:
        txf.write(data)

In [23]:
# Survey configuration: model, survey metadata, output path and question list.

model_name = 'gpt-4-turbo-2024-04-09'
title = 'Consumer Survey'
audience = 'Survey to people about their dog'
# NOTE: the run cell reassigns output_file before any rows are written, so
# this value is not the file that receives the generated results.
output_file = 'dallasRun-test.csv'

# Survey questions, in the order they are sent to the model. The N-th question
# (1-based) corresponds to the `answerN` field and to one CSV column.
questions = [
    "How old is your dog (in # of years)?",
    "What is your dog's primary breed?",
    "Approximately how much does your dog weigh (in pounds)?",
    "What is your dog's name?",
    "What was the inspiration for that name?",
    "Did you adopt your dog? (yes/no)",
    "How old was your dog when you got him/her? Provide estimated number of months old.",
    "How did you feel the first time you met your dog?",
    "How has your dog changed your outlook on life?",
]


In [24]:
# This block of code is not needed, it is there to know how function calling works.

# To learn more about function calling, check this out https://www.datacamp.com/tutorial/open-ai-function-calling-tutorial
# Answer is the class that contains the responses given by one surveyed person. It is also meant to identify which person answered the questions, but no traits field is defined here.
# The LLM is intelligent enough to give a unique response each time based on the given system and user prompts, so we should not need to worry about it.
class Answer(BaseModel):
    """Human-like set of answers told by each person while being surveyed. 
    Each person should have their own unique set of answers that should be different from other person's set of answers."""

    # Illustrative schema only: it declares four answers, whereas the schema
    # bound to the model is built with one field per question by
    # create_openai_functions().
    # Each field is a string with a description naming the question it answers.
    answer1: str = Field(description="This is the human like answer to Question1.")
    answer2: str = Field(description="This is the human like answer to Question2.")
    answer3: str = Field(description="This is the human like answer to Question3.")
    answer4: str = Field(description="This is the human like answer to Question4.")
    
    
    
# Answers is the formatted list that the LLM will generate; each element is a dictionary shaped like the Answer class.
# The LLM is expected to make each dictionary in the list different from the others, as it treats the list as a list of human responses.
class Answers(BaseModel):
    """Human-like survey answers to tell user."""

    # One Answer per surveyed person. The field name `answer` is the same key
    # that output_parser() reads and that the JsonKeyOutputFunctionsParser is
    # configured with (key_name="answer").
    answer: List[Answer]

In [25]:
# This is standard pattern of list of functions used for function calling.
# To know the pattern, you can print out openai_functions variable in previous New.ipynb file
# This function is responsible to give the output list of answers of each human that we want, based on the parameters provided.
# To learn more about function calling, check this out https://www.datacamp.com/tutorial/open-ai-function-calling-tutorial
def create_openai_functions(questions: List):
    """Build the function-calling schema for a survey.

    The result is a one-element list describing a function named ``Answers``
    whose single ``answer`` parameter is an array of per-person objects. Each
    object has one required string property ``answerN`` per question, where N
    is the 1-based position of the question.

    Args:
        questions: The survey questions; only their count and order are used.

    Returns:
        A list containing the single function definition dictionary.
    """
    question_numbers = [number for number, _ in enumerate(questions, 1)]

    # One string property per question, keyed answer1, answer2, ...
    per_person_properties = {
        f'answer{number}': {
            'description': f'This is the human like answer to Question{number}.',
            'type': 'string',
        }
        for number in question_numbers
    }

    per_person_schema = {
        'description': "Human-like set of answers told by each person while being surveyed. \nEach person should have their own unique set of answers that should be different from other person's set of answers.",
        'type': 'object',
        'properties': per_person_properties,
        # Every answer is mandatory, in question order.
        'required': [f'answer{number}' for number in question_numbers],
    }

    return [{
        'name': 'Answers',
        'description': 'Human-like survey answers to tell user.',
        'parameters': {
            'type': 'object',
            'properties': {'answer': {'type': 'array', 'items': per_person_schema}},
            'required': ['answer'],
        },
    }]
        

In [26]:
# Function-calling schema for this survey: one required answer field per entry in `questions`.
openai_functions = create_openai_functions(questions)

In [27]:
# Parser configured to pull the "answer" key out of a function-call result.
# NOTE(review): `parser` is not part of `chain` and is not used by the run cell,
# which calls output_parser() instead - confirm whether it is still needed.
parser = JsonKeyOutputFunctionsParser(key_name="answer")

In [28]:
# Chat model with the survey schema bound for function calling.
# max_tokens caps the size of each completion; the run cell sizes its batches
# against a 3900-token budget (tokens_per_batch), which sits below this cap.
llm = ChatOpenAI(
    temperature=0.7,
    model_name=model_name,
    max_tokens=4095
).bind(functions=openai_functions)

In [29]:
# System prompt sent with every request. It fixes the persona and the answer
# formatting rules (short answers, bare numbers / amounts / percentages).
# NOTE(review): the persona ("Women, Age 45-60") is hard-coded here as well as
# in traits_and_counts - keep the two in sync when changing the audience.
system_message = """

Assume you are a group of Women, Age 45-60 when answering all of the following questions

Directly answer each question in the first person, providing the requested information, without additional context unless it’s directly relevant. Your responses should embody the individual with the traits provided, showcasing a natural human variability in answering.

- Be sure that you answer every single question. Do not miss answering any question.

- When addressing gender and the traits are not clear, choose 'male' or 'female' based on the context provided by the traits. If traits suggest a higher likelihood for one gender over the other, use that as a basis for your choice.

- Maintain consciousness and avoid AI-like language. The responses should feel personal, as if they are derived from the individual's own experiences and knowledge.

- Introduce variety in your responses to mirror the natural differences in human behavior and decision-making. No two humans are exactly alike, so your answers should reflect a range of possible reactions and thoughts.

- Humans do have things in common though so make sure that the results reflect that and have distributions like an expected survey would have.

- Use the context from previous answers for each person to inform your responses to other questions for that individual. 

- If a question is asked for which the answer is not directly provided by the traits, make an educated guess based on the available information and what is known about typical behaviors or preferences of people with similar traits.

- Do not include any other questions and answers in your response. Just give the answer to the question you are being asked.

-The aim is to craft responses that are individually tailored, exhibit the complexity and subtlety of human thought, and are coherent throughout the survey, as if the answers are coming from one person with a consistent set of traits and experiences.

-Ensure the answers are short and match the output asked for in the questions.
If the output asks for a numerical output just give the number. For example if you 
are asked how old are you just give the number and don't have any text before or after that unless prompted to do so.

If the output asks for a monetary output, give just the dollar amount and no text before or after it unless prompted to do so. So for example if the answer is 
$40 just give $40.

If the output asks for a decimal output, give just the value and no text before or after it unless prompted to do so. So for example if the answer is 
40.5 just give 40.5.

If you are asked a question like where do you live? Be sure to just answer the place that you live. No text before or after it is needed unless prompted to do so.

If the answer is asking for a percentage output just give the value. So if the answer is 45% just give 45%. Do not give any text before or after that unless prompted to do so.

"""





In [30]:
# Chat template: the fixed system prompt followed by a user message whose
# "{input}" placeholder is filled by chain.invoke({"input": ...}).
prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("user", "{input}")]
)

In [31]:
# Pipe the prompt into the function-bound model. The raw model message is
# returned; it is decoded afterwards with output_parser() and get_output_token().
chain = prompt | llm 

In [32]:
def output_parser(response):
    """Extract the list of per-person answers from a function-call response.

    The function-call arguments are model-generated text, so they are decoded
    as JSON and never evaluated as Python code. Strict ``json.loads`` is tried
    first; if that fails (for example on truncated or slightly malformed
    output), ``json_repair.loads`` is used as a lenient fallback.

    Args:
        response: Model message exposing
            ``additional_kwargs['function_call']['arguments']`` as a string.

    Returns:
        The value stored under the ``'answer'`` key, or ``None`` when the
        response carries no function call or the arguments cannot be decoded.
    """
    try:
        raw_arguments = response.additional_kwargs['function_call']['arguments']
    except (AttributeError, KeyError, TypeError):
        # No function call in this response: nothing to parse.
        return None

    try:
        return json.loads(raw_arguments)['answer']
    except (KeyError, TypeError, ValueError):
        # Not strict JSON (json.JSONDecodeError is a ValueError) or not the
        # expected shape; fall through to the tolerant parser.
        pass

    try:
        return json_repair.loads(raw_arguments)['answer']
    except Exception:
        # Best effort: the caller treats None as "no valid responses".
        return None

In [35]:
def get_output_token(response):
    """Return the number of completion tokens reported for a model response.

    Args:
        response: Model message exposing
            ``response_metadata['token_usage']['completion_tokens']``.

    Returns:
        The completion token count as an ``int``, or ``None`` when the
        metadata is missing or the value cannot be converted to an integer.
    """
    try:
        return int(response.response_metadata['token_usage']['completion_tokens'])
    except (AttributeError, KeyError, TypeError, ValueError):
        # Retrying the identical lookup cannot give a different result, so a
        # single attempt is made and failure is reported as None.
        return None

In [36]:
# (trait description, number of responses to generate) pairs.
# The first entry's trait is also used for the single-person calibration call.
traits_and_counts = [
    ('Women Aged 45-60 and own 1 dog', 100),
]


# from transformers import GPT2Tokenizer

# def count_tokens(text):
#     # Load the tokenizer
#     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

#     # Tokenize the text and count the tokens
#     tokens = tokenizer.encode(text)
#     return len(tokens)




# Calibration call: generate answers for a single person and read how many
# completion tokens that one response cost, so that later batches can be sized
# to fit inside the per-request token budget.
test_message = (
    f"Generate survey answers from 1 person.\nAll of the surveyed people are {audience}\n"
    f"The title of the survey: {title}\n"
    # The trailing newline keeps the trait separate from the next instruction.
    f"1 of the surveyed people have this trait: {traits_and_counts[0][0]}\n"
    "Please answer all questions below:\n"
)
test_message += "".join(f"{question}\n" for question in questions)

TestRes = chain.invoke({"input": test_message})
responseTest = output_parser(TestRes)
# The token count comes from the usage reported with the response.
total_tokens = get_output_token(TestRes)
print("Total tokens:", total_tokens)

# Token budget for one batch of responses.
tokens_per_batch = 3900

if total_tokens is None or total_tokens <= 0:
    # Without a usable token count no batch size can be derived; fail with a
    # clear message instead of a TypeError / ZeroDivisionError below.
    raise RuntimeError(
        f"Calibration call did not report a usable completion token count: {total_tokens!r}"
    )

# Always allow at least one response per batch, even if a single response is
# larger than the budget.
max_responses_per_batch = max(1, tokens_per_batch // int(total_tokens))
print(f"Maximum responses per batch based on token usage: {max_responses_per_batch}")


# Generation loop: for every trait, request responses in batches until the
# requested number has been written to the CSV file.
counts = 0
total_processed_row = 0
processed_this_batch = 0
person_number = 0
# NOTE: this overrides the output_file value set in the variables cell.
output_file = 'dallasRunTest.csv'
# Stop retrying a trait after this many failed batches in a row, so a
# persistent API or parsing failure cannot loop (and bill) forever.
max_consecutive_failures = 5

traits_dict = dict(traits_and_counts)

# Write the header row only when starting a new file; rows are appended otherwise.
if not os.path.exists(output_file):
    headers = ['Traits', 'Person Number'] + questions
    write_csv(output_file, headers, mode='w')

for traits, counts in traits_dict.items():
    consecutive_failures = 0

    while counts > 0:
        if consecutive_failures >= max_consecutive_failures:
            print(f"Giving up on trait {traits!r} after {consecutive_failures} consecutive failed batches; "
                  f"{counts} responses still missing.")
            break

        # Never request fewer than one person, even if the computed per-batch
        # maximum is zero.
        current_batch_size = max(1, min(max_responses_per_batch, counts))
        print(f"Processing batch of {current_batch_size} responses for trait: {traits}")

        input_message = (
            f"Generate survey answers from {current_batch_size} people.\nAll of the surveyed people are {audience}\n"
            f"The title of the survey: {title}\n"
            f"{current_batch_size} of the surveyed people have this trait: {traits}\n"
            "Please answer all questions below:\n"
        ) + "".join(f"{question}\n" for question in questions)

        # Only the model call is guarded; file-system errors from write_csv
        # are allowed to propagate instead of being reported as API errors.
        try:
            res = chain.invoke({"input": input_message})
        except Exception as e:
            print('OpenAI Error:', e)
            consecutive_failures += 1
            continue

        responses = output_parser(res)
        if not isinstance(responses, list):
            responses = []
        # Keep only well-formed (dict) entries and drop anything beyond what
        # was asked for, so the total never exceeds the requested count.
        responses = [item for item in responses if isinstance(item, dict)][:current_batch_size]

        if not responses:
            print("No valid responses or error occurred")
            consecutive_failures += 1
            continue

        consecutive_failures = 0

        for response in responses:
            person_number += 1
            response_data = [traits, person_number]
            # Using an empty string as the default value for missing answers
            response_data.extend(
                response.get(f'answer{number}', '') for number in range(1, len(questions) + 1)
            )
            write_csv(output_file, response_data)

        # Update the counters only after the rows have actually been written.
        processed_this_batch = len(responses)
        counts -= processed_this_batch
        total_processed_row += processed_this_batch

        print(f"Batch complete. {len(responses)} responses processed.")
        print(f'Remaining counts for {traits}: {counts}')
        print("Total responses given by model:", processed_this_batch)
        print("Total processed responses:", total_processed_row)


content='' additional_kwargs={'function_call': {'arguments': '{"answer":[{"answer1":"7","answer2":"Labrador Retriever","answer3":"75","answer4":"Buddy","answer5":"He just looked like a Buddy to us; it fit perfectly.","answer6":"yes","answer7":"8","answer8":"I felt an instant connection and immense joy.","answer9":"He\'s taught me the value of unconditional love and patience."}]}', 'name': 'Answers'}} response_metadata={'token_usage': {'completion_tokens': 92, 'prompt_tokens': 971, 'total_tokens': 1063}, 'model_name': 'gpt-4-turbo-2024-04-09', 'system_fingerprint': 'fp_76f018034d', 'finish_reason': 'function_call', 'logprobs': None} id='run-e6ad44aa-3c72-4b97-a0a6-3fa5b7d6ee76-0'
Total tokens: 92
content='' additional_kwargs={'function_call': {'arguments': '{"answer":[{"answer1":"7","answer2":"Labrador Retriever","answer3":"75","answer4":"Buddy","answer5":"He just looked like a Buddy to us; it fit perfectly.","answer6":"yes","answer7":"8","answer8":"I felt an instant connection and imme