In [1]:
import warnings, csv, os, sys, json
import pandas as pd
from typing import List
warnings.filterwarnings("ignore")
from langchain_openai import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import (
    ChatPromptTemplate
)
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [2]:
def write_csv(output_file, data):
    """Append one row to a CSV file.

    Args:
        output_file: Path of the CSV file; it is opened in append mode, so it is
            created on first use and never truncated.
        data: Iterable of cell values forming a single row.
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_file, mode='a', newline='', encoding='utf-8') as csv_handle:
        csv.writer(csv_handle).writerow(data)

In [3]:
def write_txt(txt_path, data):
    """Append text to a log file.

    Args:
        txt_path: Path of the text file; opened in append mode (created if missing).
        data: Payload to append. Non-string values are converted with ``str()`` so
            that response objects can be logged directly.
    """
    # Explicit utf-8 (same as write_csv): model output routinely contains non-ASCII
    # characters, and the platform default encoding may not be able to encode them.
    with open(txt_path, 'a', encoding='utf-8') as txf:
        txf.write(str(data))

In [4]:
# Run configuration: everything a user is expected to tune lives in this cell.

model_name = 'gpt-4-1106-preview'  # OpenAI chat model name handed to ChatOpenAI below
audience = ' US owners of 1 dog'  # audience description interpolated into the user prompt ("All of the surveyed people are ...")
output_file = 'runNew.csv'  # CSV that receives one row per surveyed person (appended to; re-runs resume from it)
txt_file = 'output.txt'  # text log that receives raw model responses and diagnostic markers

# Survey questions, in the order they are asked.
# The list can be edited freely: the function-calling schema, the prompt and the CSV
# header are all derived from it (question N maps to the `answerN` field).
questions = [
"How old is your dog (in # of years)?",
"What is your dog's primary breed?",
"Approximately how much does your dog weigh (in pounds)?",
"What is your dog's name?",
"What was the inspiration for that name?",
"What is the most common nickname (if any) that you use for your dog?",
"When did you get your dog? (Before 2020, In 2020, After 2020)",
"Did you adopt your dog? (yes/no)",
"How old was your dog when you got him/her? Provide estimated number of months old.",
"How did you feel the first time you met your dog?",
"How has your dog changed your outlook on life?",
"On a scale of 1-5, where 1 is not at all important and 5 is very important, how important is your dog's happiness to your overall well-being?",
"Choose the term that best describes your dog's relationship with you.(Best Friend,Loyal companion,Family Member,Protector,None of these apply)",
"Why did you choose that term?",
"What is the most recent command or trick your dog has learned, and how long did it take to train?",
"How many hours a day does your dog typically sleep and where does your dog usually sleep in your home?",
"Have you shopped for your dog online in the past year? (yes/no)",
"Over the past year, approximately how much did you spend on your dog? Provide total $ amount.",
"Over the past year, approximately what percent of the money you spent on your dog was paid to your veterinarian? Choose a percent from 0% to 100%.",
"Over the past year, approximately what percent of the money you spent on your dog was on dog food? Choose a percent from 0% to 100%.",
# NOTE(review): the next question reads "approximately much money" (missing "how"); left
# untouched here because the text is also used as a CSV column header.
"In March, approximately much money did you spend on your dog? Provide estimated total $ amount.",
"How did the amount of money you spent on your dog in March compare with February?(Spent about the same in March,I'm not sure,Spent more in March,Spent less in March)",
"How often do you purchase dog food? (Bi-weekly,Monthly,Less frequently than monthly,Weekly)",
"Do you primarily feed your dog dry food, wet food, or a mix?",
"Which of the following statements best describes your level of familiarity of your dog's primary food brand?(I know a lot about the brand,I know just the basics about the brand,I'm not very familiar with the brand)",
# NOTE(review): the six bare labels that follow look like rows of an importance rating
# grid introduced by the question just below; sent as standalone questions they carry no
# scale of their own, and the same labels appear again further down (satisfaction grid),
# so these strings occur twice in the CSV header. Confirm this is intended.
"On a scale of 1-5, where 1 is not at all important and 5 is very important, how important is the following in choosing your primary dog food brand? Quality of ingredients",
"Price",
"Nutritional value of the dog food",
"Brand reputation",
"How your dog likes the taste",
"Your veterinarian's recommendation",
"Special dietary needs",
"What dog food brand do you give your dog the most?",
"What are the main 2-3 reasons you choose this brand?",
"On a scale of 1-5, where 1 is not at all satisfied and 5 is very satisfied, how satisfied are you with the following for your primary dog food brand? Quality of ingredients",
"Price",
"Nutritional value of the dog food",
"Brand reputation",
"How your dog likes the taste",
"Your veterinarian's recommendation",
"Special dietary needs",
"How many times a day does your dog typically go out for a walk?(0,1,2,3,4,5,more than 5,not sure)",
"How (if at all) have recent events affected how you interact with your dog?(No change,Not sure,I interact more,I interact less)",
"What events are affecting your interaction level most?",
"Are you planning to watch the Summer Olympics with your dog?(No,Yes,Not Sure)",
"Are you planning to travel this Spring with your dog?(No,Yes,Not Sure)",
"If planning to travel this Spring with your dog, which is your preferred method of transportation? Select the best option.(Bus,Car,N/A-I'm not planning to travel with my dog this Spring,Other)",
"What is your gender?",
"What is your age?",
"How many people, including yourself, live in your household?(1,2,3,5 or more)",
"What is the highest level of education you have completed?(Associate degree, Bachelor's degree,Graduate degree, High school diploma or equivalent, Some college, no degree)",
"What is your current employment status?(Employed full-time,Employed part-time,Retired,Self-employed,Unable to work,Unemployed)",
"What is your total annual household income before taxes?($100,000 to $149,000,$150,000 or more,$25,000 to $49,999,$50,000 to $74,999, Less than $25,000, Prefer not to say)",
"In what city do you currently reside?",
"In what state do you currently reside?"
]

# (trait description, number of people to survey with that trait) pairs.
# The driver cell turns this into a per-trait quota and subtracts rows already present
# in output_file, so raising a count and re-running only generates the missing people.
traits_and_counts = [
    ('Men aged 60+', 100)
]


In [5]:
# questions = [
#     'How happy are you with your life on a scale of 1-5 where 1 is very unhappy and 5 is very happy? (#)',
#     'What are the few tech tools that you use daily?',
#     'What is your profession?',
# ]

# traits_and_counts = [
#     ('Men being currently employed', 4),
#     ('Women being currently employed', 3)
# ]


In [6]:
# This block of code is not needed, it is there to know how function calling works.

# To learn more about function calling, check this out https://www.datacamp.com/tutorial/open-ai-function-calling-tutorial
# Answer is the class that holds the responses given by one surveyed person; it also contains a traits field identifying which kind of person answered the questions.
# The LLM is intelligent enough to give a unique response each time based on the given system and user prompts, so we do not need to handle that ourselves.
# Illustrative pydantic schema for ONE surveyed person, fixed at four questions.
# No other visible cell references this class; the schema actually sent to the model is
# built dynamically by create_openai_functions().
# NOTE(review): the docstring and the Field descriptions below are presumably emitted as
# schema descriptions when the class is converted to an OpenAI function, so they are
# kept verbatim — confirm before rewording them.
class Answer(BaseModel):
    """Human-like set of answers told by each person while being surveyed. 
    Each person should have their own unique set of answers that should be different from other person's set of answers."""

    # One free-text field per question, named answer<N> for Question<N>.
    answer1: str = Field(description="This is the human like answer to Question1.")
    answer2: str = Field(description="This is the human like answer to Question2.")
    answer3: str = Field(description="This is the human like answer to Question3.")
    answer4: str = Field(description="This is the human like answer to Question4.")
    # Trait label of the respondent (e.g. an entry from traits_and_counts).
    traits: str = Field(description="This is the trait of the person.")
    
    
    
# Answers is the formatted list that the LLM generates; each element is a dictionary matching the Answer class.
# The LLM is expected to keep each dictionary in the list distinct from the others, because it treats the list as a list of human responses.
# Illustrative top-level schema: the whole model output is a single object whose
# `answer` field is a list with one Answer per surveyed person. The key name `answer`
# is the same one the parser and output_parser() read from the function-call arguments.
class Answers(BaseModel):
    """Human-like survey answers to tell user."""

    # One entry per surveyed person.
    answer: List[Answer]

In [7]:
# This is standard pattern of list of functions used for function calling.
# To know the pattern, you can print out openai_functions variable in previous New.ipynb file
# This function is responsible to give the output list of answers of each human that we want, based on the parameters provided.
# To learn more about function calling, check this out https://www.datacamp.com/tutorial/open-ai-function-calling-tutorial
def create_openai_functions(questions: List):
    """Build the OpenAI function-calling schema for a survey of any length.

    The result describes a single function, ``Answers``, whose only argument
    ``answer`` is an array of per-person objects. Each object has a ``traits``
    string plus one ``answer<N>`` string per entry in ``questions`` (1-based), and
    all of those fields are marked as required.

    Args:
        questions: Iterable of questions; only their count/positions are used here,
            the question text itself is sent separately in the prompt.

    Returns:
        A one-element list holding the function definition dict.
    """
    positions = [position for position, _ in enumerate(questions, 1)]

    # 'traits' goes first so the property/required ordering matches the answers after it.
    person_properties = {
        'traits': {'description': 'This is the trait of the person.',
                   'type': 'string'},
    }
    person_properties.update({
        f'answer{position}': {
            'description': f'This is the human like answer to Question{position}.',
            'type': 'string',
        }
        for position in positions
    })
    person_required = ['traits'] + [f'answer{position}' for position in positions]

    person_schema = {
        'description': "Human-like set of answers told by each person while being surveyed. \nEach person should have their own unique set of answers that should be different from other person's set of answers.",
        'type': 'object',
        'properties': person_properties,
        'required': person_required,
    }
    answers_function = {
        'name': 'Answers',
        'description': 'Human-like survey answers to tell user.',
        'parameters': {
            'type': 'object',
            'properties': {'answer': {'type': 'array', 'items': person_schema}},
            'required': ['answer'],
        },
    }
    return [answers_function]
        

In [8]:
# Function-calling schema for the configured survey: one `answer<N>` field per entry in
# `questions`, plus `traits`. This is what gets bound to the LLM below.
openai_functions = create_openai_functions(questions)

In [9]:
# Parser that would return the value under the "answer" key of the function-call output.
# NOTE: the active chain below is `prompt | llm` (the `| parser` variant is commented
# out), so this object is currently unused; output_parser() does the extraction instead.
parser = JsonKeyOutputFunctionsParser(key_name="answer")

In [10]:
# Chat model with the survey schema bound as OpenAI functions, so responses arrive as
# function-call arguments rather than free text.
# temperature=0.9 is deliberately high: the system prompt asks for varied, non-repeating answers.
llm = ChatOpenAI(
    temperature=0.9,
    model_name=model_name
).bind(functions=openai_functions)

In [11]:
# Default system prompt; edit it to change the respondent persona or the answer formatting rules.
# It has two parts: instructions to vary style/mood/length between respondents, and
# formatting rules for monetary, decimal, location and percentage answers.
# The text is sent to the model verbatim, so wording changes alter model behavior.
system_message = """
You are a survey answering bot that generates answers like a survey when questions are asked. The answer should be made as if you are a human. 
Give answers assuming you are a new human with different lifestyles while giving answers.
Give answers with new thoughts, new ideas, new moods, you can also choose to answer very rudely, but the main idea is to be random, do not try to repeat same answers since humans have different answers with different answer style.
While giving answers, you should be as creative as possible and you should deviate your answers as much as possible from previous answers.
In every answer, change styles of answers, change average sentence lengths of answer, change fk_grade_level of sentences of answer. Make it different from previous answers. But also make sure it is the answer given by a human. So, don't make it seem like it is AI generated. Add both simple and fancy words.
In 1 answer, give your answers assuming you are having a worse life, in another answer, give your answers assuming you are having best life. like this, keep on changing the lifestyle of human that you are.
Avoid same repeated answers as much as possible.
Do no repeat same pattern in each answers. Give short answers sometimes and sometimes long answers, be random.
Since, human can give both positive and negative answers, you should follow the same principles.
Your answers should be descriptive just like human answers.
Each set of answers should be different from another set of answers. 
If you are asked about 'top few things' or 'few things', each answers should have random number of comma separated sentences. For example:
sentence1, sentence2, and sentence3. (3 sentences)
sentence1. (1 sentence)
sentence1, sentence2, sentence3, sentence4, and sentence5. (5 sentences)
sentence1, sentence2. (2 sentences)
sentence1, sentence2, sentence3, and sentence4. (4 sentences)


If the output asks for a monetary output give just the dollar amount and no text before or after it unless prompted to do so. So for example if the answer is $40 just give $40.
If the output asks for a decimal output give just the value and no text before or after it unless prompted to do so. So for example if the answer is 40.5 just give 40.5.
If you are asked a question like where do you live? Be sure to just answer the place that you live. No text before or after it is needed unless prompted to do so.
If the answer is asking for a percentage output just give the value. So if the answer is 45% just give 45%. Do not give any text before or after that unless prompted to do so.
"""

In [12]:
# Chat template: the fixed system prompt followed by a user message whose content is
# supplied at call time through the `input` variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("user", "{input}")]
)

In [13]:
# Chain that fills the prompt and calls the function-bound LLM.
# The built-in parser stage is intentionally left out (see the commented line): the
# chain returns the raw model message and output_parser() extracts the answers from it.

# chain = prompt | llm | parser

chain = prompt | llm 

In [14]:
def output_parser(response):
    """Extract the list of per-person answers from a function-calling response.

    Args:
        response: Object exposing ``additional_kwargs`` (a dict) whose
            ``['function_call']['arguments']`` entry is the argument payload
            produced by the model, as text.

    Returns:
        The value stored under the ``'answer'`` key of the decoded payload, or
        ``None`` when the response has no function call or the payload cannot be
        decoded. Failures are recorded in ``txt_file``.
    """
    import ast  # local import: only needed for the literal fallback below

    failure_marker = "\n-Failed-eval-json-loads---------------------------------------\n"

    try:
        raw_arguments = response.additional_kwargs['function_call']['arguments']
    except (AttributeError, KeyError, TypeError):
        # The model answered with plain content instead of a function call.
        write_txt(txt_file, failure_marker)
        return None

    # SECURITY: the payload is model-generated text, so it must never be passed to
    # eval(). JSON is the expected format; ast.literal_eval is a safe fallback for
    # payloads written as Python literals (e.g. single-quoted keys) and cannot
    # execute code.
    for decode in (json.loads, ast.literal_eval):
        try:
            return decode(raw_arguments)['answer']
        except (ValueError, SyntaxError, TypeError, KeyError, RecursionError, MemoryError):
            continue

    # Logged with write_txt: the CSV writer would split the marker into one cell per character.
    write_txt(txt_file, failure_marker)
    return None

In [15]:
# Survey driver: work out how many people per trait are still missing from
# output_file, then keep asking the model for batches until every quota is filled.
# Valid responses are appended to output_file; raw responses and diagnostic markers
# are appended to txt_file.

# Stop after this many API errors in a row instead of retrying forever (a bad key or
# an exhausted quota will not fix itself).
MAX_CONSECUTIVE_API_FAILURES = 5


def _match_trait(raw_trait, quotas):
    """Return the configured trait key that `raw_trait` refers to, or None if there is none."""
    if not isinstance(raw_trait, str):
        return None
    if raw_trait in quotas:
        return raw_trait
    # The model sometimes embellishes the label; accept the first configured trait that
    # appears verbatim inside the returned text.
    for known_trait in quotas:
        if known_trait in raw_trait:
            return known_trait
    return None


total_processed_row = 0  # rows written during this run
total_remaining_surveyed = 0  # people still to be surveyed when the run starts
person_number = 0  # running person id; continues from rows already in the CSV
traits_dict = {}  # trait -> number of people still needed

for trait_name, trait_count in traits_and_counts:
    traits_dict[trait_name] = int(trait_count)

# os.path.exists also works when output_file contains a directory component; an empty
# file (e.g. an interrupted first run) is treated like a missing one so it gets a header.
if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
    write_csv(output_file, ['Traits', 'Person Number'] + questions)
else:
    df = pd.read_csv(output_file)
    old_traits_dict = df['Traits'].value_counts().to_dict()  # trait -> rows already surveyed
    for k, v in old_traits_dict.items():
        person_number += int(v)
        if k in traits_dict:
            # Clamp at zero: a surplus in one trait must not reduce what other traits still need.
            traits_dict[k] = max(traits_dict[k] - int(v), 0)

# Suppose 7 are needed and 3 are already in the CSV: only 4 remain to be processed.
total_remaining_surveyed = sum(traits_dict.values())

print("Remaining Traits/counts dict:", traits_dict)
print("Total remaining people to be surveyed:", total_remaining_surveyed)
print()

if total_remaining_surveyed <= 0:
    print("All the people are surveyed, if you want to increase survey count, increase the count in traits_and_counts variable at top")
    sys.exit()

answer_keys = [f'answer{ind}' for ind, _ in enumerate(questions, 1)]
consecutive_api_failures = 0

# Loop until every trait quota is filled (the model usually returns fewer people than requested).
while any(count > 0 for count in traits_dict.values()):
    total_surveyed = sum(count for count in traits_dict.values() if count > 0)
    input_message = f"Generate survey answers from {total_surveyed} people.\nAll of the surveyed people are {audience}\n"
    for traits, counts in traits_dict.items():
        if counts <= 0:
            continue
        input_message += f'{counts} of the surveyed people have this trait: {traits}\n'
    input_message += 'Extremely Important Note: You must compulsory give answers to all the questions provided below. Do not skip any questions.\n'
    for ind, question in enumerate(questions, 1):
        input_message += f'Question{ind}: {question}\n'

    try:
        res = chain.invoke({"input": input_message})
    except Exception as e:  # any client/network error: report it and retry a bounded number of times
        consecutive_api_failures += 1
        print('OpenAI Error', e)
        print()
        print()
        if consecutive_api_failures >= MAX_CONSECUTIVE_API_FAILURES:
            print(f"Stopping after {consecutive_api_failures} consecutive OpenAI errors; re-run the cell to resume from {output_file}.")
            break
        continue
    consecutive_api_failures = 0

    # Raw response goes to the text log as text (the CSV writer would mangle it).
    write_txt(txt_file, f"{res}\n")
    responses = output_parser(res)
    if not isinstance(responses, list):
        # None (already logged by output_parser) or an unexpected payload shape: ask again.
        continue

    final_responses = []
    for data in responses:
        if not isinstance(data, dict):
            continue
        if 'traits' not in data:
            write_txt(txt_file, "\n-Traits-not-found---------------------------------------\n")
            continue
        missing_key = next((key for key in answer_keys if key not in data), None)
        if missing_key is not None:
            # Incomplete respondent: drop it rather than writing a short row.
            write_txt(txt_file, f"\n-{missing_key}-not-found---------------------------------------\n")
            continue
        matched_trait = _match_trait(data['traits'], traits_dict)
        if matched_trait is None:
            write_txt(txt_file, "\n-Traits-not-matched---------------------------------------\n")
            continue
        if traits_dict[matched_trait] <= 0:
            # The model returned more people with this trait than are still needed.
            write_txt(txt_file, "\n-Traits-quota-full---------------------------------------\n")
            continue
        traits_dict[matched_trait] -= 1
        data['traits'] = matched_trait  # normalise to the configured label so resume counting works
        final_responses.append(data)

    for data in final_responses:
        person_number += 1
        write_csv(output_file, [data['traits'], person_number] + [data[key] for key in answer_keys])

    write_txt(txt_file, "\n-Completed---------------------------------------\n")
    total_processed_row += len(final_responses)
    print('Remaining traits:', traits_dict)
    print("Total responses given by model:", len(final_responses))
    print("Total processed responses:", total_processed_row)
    print(final_responses)
    print()
    print()
    

Remaining Traits/counts dict: {'Men aged 60+': 100}
Total remaining people to be surveyed: 100

Remaining traits: {'Men aged 60+': 95}
Total responses given by model: 5
Total processed responses: 5
[{'traits': 'Men aged 60+', 'answer1': '9', 'answer2': 'Golden Retriever', 'answer3': '85', 'answer4': 'Buddy', 'answer5': "He looks just like the dog from the movie 'Air Bud'", 'answer6': 'Bud', 'answer7': 'Before 2020', 'answer8': 'yes', 'answer9': '8', 'answer10': 'It was love at first sight', 'answer11': "He's shown me the joy of taking life at a slower, more appreciative pace.", 'answer12': '5', 'answer13': 'Best Friend', 'answer14': "Because he's always there for me, through thick and thin.", 'answer15': 'Shake hands, took about two weeks', 'answer16': 'Approximately 14 hours, in a basket by the fireplace', 'answer17': 'yes', 'answer18': '1200', 'answer19': '30%', 'answer20': '40%', 'answer21': '100', 'answer22': 'Spent about the same in March', 'answer23': 'Bi-weekly', 'answer24': 'Dr

KeyError: 'answer16'